mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 15:10:44 +00:00
Compare commits
108 Commits
jcsp/tenan
...
arpad/lsn_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
96b3171799 | ||
|
|
74d150ba45 | ||
|
|
b7f45204a2 | ||
|
|
a05f104cce | ||
|
|
d672e44eee | ||
|
|
a6f892e200 | ||
|
|
71b380f90a | ||
|
|
6e145a44fa | ||
|
|
8e5e3971ba | ||
|
|
8dd29f1e27 | ||
|
|
f5344fb85a | ||
|
|
f95f001b8b | ||
|
|
e0821e1eab | ||
|
|
4469b1a62c | ||
|
|
842223b47f | ||
|
|
893616051d | ||
|
|
7cdde285a5 | ||
|
|
9c30883c4b | ||
|
|
0495798591 | ||
|
|
87389bc933 | ||
|
|
ea118a238a | ||
|
|
e9b227a11e | ||
|
|
40441f8ada | ||
|
|
a8a39cd464 | ||
|
|
b989ad1922 | ||
|
|
acef742a6e | ||
|
|
11d9d801b5 | ||
|
|
fc47af156f | ||
|
|
e310533ed3 | ||
|
|
1d68f52b57 | ||
|
|
4cd47b7d4b | ||
|
|
0141c95788 | ||
|
|
0ac4cf67a6 | ||
|
|
4be6bc7251 | ||
|
|
a394f49e0d | ||
|
|
c00651ff9b | ||
|
|
2caa795139 | ||
|
|
bea8efac24 | ||
|
|
52017eff68 | ||
|
|
ad5b02e175 | ||
|
|
b09a851705 | ||
|
|
85cd97af61 | ||
|
|
e6470ee92e | ||
|
|
dc72567288 | ||
|
|
6defa2b5d5 | ||
|
|
b3d3a2587d | ||
|
|
b85fc39bdb | ||
|
|
09b5954526 | ||
|
|
306c4f9967 | ||
|
|
5ceccdc7de | ||
|
|
cdcaa329bf | ||
|
|
27bdbf5e36 | ||
|
|
4c7fa12a2a | ||
|
|
367971a0e9 | ||
|
|
51570114ea | ||
|
|
098d3111a5 | ||
|
|
090246cad7 | ||
|
|
3737fe3a4b | ||
|
|
5650138532 | ||
|
|
2dca4c03fc | ||
|
|
0b790b6d00 | ||
|
|
e82d1ad6b8 | ||
|
|
4f0a8e92ad | ||
|
|
5952f350cb | ||
|
|
726c8e6730 | ||
|
|
f7067a38b7 | ||
|
|
896347f307 | ||
|
|
d7715531da | ||
|
|
e5c81fef86 | ||
|
|
7ebe9ca1ac | ||
|
|
7a38352865 | ||
|
|
b06f36ebca | ||
|
|
1588601503 | ||
|
|
9c35e1e6e5 | ||
|
|
d8c21ec70d | ||
|
|
ad99fa5f03 | ||
|
|
e675f4cec8 | ||
|
|
4db8efb2cf | ||
|
|
07c2b29895 | ||
|
|
9cdffd164a | ||
|
|
87db4b441c | ||
|
|
964c5c56b7 | ||
|
|
bd59349af3 | ||
|
|
2bd79906d9 | ||
|
|
493b47e1da | ||
|
|
c13e932c3b | ||
|
|
a5292f7e67 | ||
|
|
262348e41b | ||
|
|
68f15cf967 | ||
|
|
39f8fd6945 | ||
|
|
83567f9e4e | ||
|
|
06a9692956 | ||
|
|
804508ac17 | ||
|
|
3d2266c842 | ||
|
|
61783d6000 | ||
|
|
bae377ed81 | ||
|
|
3b4872eea7 | ||
|
|
57095b98c1 | ||
|
|
c205e96d89 | ||
|
|
ed923b6102 | ||
|
|
71611f4ab3 | ||
|
|
7c16b5215e | ||
|
|
39b148b74e | ||
|
|
116c342cad | ||
|
|
ba4fe9e10f | ||
|
|
de90bf4663 | ||
|
|
8360307ea0 | ||
|
|
6129077d31 |
@@ -22,5 +22,11 @@ platforms = [
|
||||
# "x86_64-pc-windows-msvc",
|
||||
]
|
||||
|
||||
[final-excludes]
|
||||
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
|
||||
# from depending on workspace-hack because most of the dependencies are not used.
|
||||
workspace-members = ["vm_monitor"]
|
||||
|
||||
# Write out exact versions rather than a semver range. (Defaults to false.)
|
||||
# exact-versions = true
|
||||
|
||||
5
.github/ISSUE_TEMPLATE/epic-template.md
vendored
5
.github/ISSUE_TEMPLATE/epic-template.md
vendored
@@ -17,8 +17,9 @@ assignees: ''
|
||||
## Implementation ideas
|
||||
|
||||
|
||||
## Tasks
|
||||
- [ ]
|
||||
```[tasklist]
|
||||
### Tasks
|
||||
```
|
||||
|
||||
|
||||
## Other related tasks and Epics
|
||||
|
||||
2
.github/actionlint.yml
vendored
2
.github/actionlint.yml
vendored
@@ -1,5 +1,7 @@
|
||||
self-hosted-runner:
|
||||
labels:
|
||||
- arm64
|
||||
- dev
|
||||
- gen3
|
||||
- large
|
||||
- small
|
||||
|
||||
8
.github/workflows/build_and_test.yml
vendored
8
.github/workflows/build_and_test.yml
vendored
@@ -172,10 +172,10 @@ jobs:
|
||||
# https://github.com/EmbarkStudios/cargo-deny
|
||||
- name: Check rust licenses/bans/advisories/sources
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo deny check
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
|
||||
build-neon:
|
||||
needs: [ check-permissions ]
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
@@ -187,6 +187,7 @@ jobs:
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build_type }}
|
||||
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
@@ -723,6 +724,7 @@ jobs:
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
@@ -847,7 +849,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.18.2
|
||||
VM_BUILDER_VERSION: v0.18.5
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
181
.github/workflows/neon_extra_builds.yml
vendored
181
.github/workflows/neon_extra_builds.yml
vendored
@@ -21,7 +21,10 @@ env:
|
||||
|
||||
jobs:
|
||||
check-macos-build:
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
|
||||
if: |
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
github.ref_name == 'main'
|
||||
timeout-minutes: 90
|
||||
runs-on: macos-latest
|
||||
|
||||
@@ -112,8 +115,182 @@ jobs:
|
||||
- name: Check that no warnings are produced
|
||||
run: ./run_clippy.sh
|
||||
|
||||
check-linux-arm-build:
|
||||
timeout-minutes: 90
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
|
||||
env:
|
||||
# Use release build only, to have less debug info around
|
||||
# Hence keeping target/ (and general cache size) smaller
|
||||
BUILD_TYPE: release
|
||||
CARGO_FEATURES: --features testing
|
||||
CARGO_FLAGS: --locked --release
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 16 revision for caching
|
||||
id: pg_v16_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set env variables
|
||||
run: |
|
||||
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v14 -j$(nproc)
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v15 -j$(nproc)
|
||||
|
||||
- name: Build postgres v16
|
||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v16 -j$(nproc)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: mold -run make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
- name: Run cargo test
|
||||
run: |
|
||||
cargo test $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
||||
|
||||
check-codestyle-rust-arm:
|
||||
timeout-minutes: 90
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
|
||||
# This will catch compiler & clippy warnings in all feature combinations.
|
||||
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
|
||||
# NB: keep clippy args in sync with ./run_clippy.sh
|
||||
- run: |
|
||||
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
|
||||
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
|
||||
echo "No clippy args found in .neon_clippy_args"
|
||||
exit 1
|
||||
fi
|
||||
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
||||
- name: Run cargo clippy (debug)
|
||||
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
||||
- name: Run cargo clippy (release)
|
||||
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
run: cargo doc --workspace --no-deps --document-private-items
|
||||
env:
|
||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo fmt --all -- --check
|
||||
|
||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
||||
- name: Check rust dependencies
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||
|
||||
# https://github.com/EmbarkStudios/cargo-deny
|
||||
- name: Check rust licenses/bans/advisories/sources
|
||||
if: ${{ !cancelled() }}
|
||||
run: cargo deny check
|
||||
|
||||
gather-rust-build-stats:
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
|
||||
if: |
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
github.ref_name == 'main'
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
|
||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -2,7 +2,7 @@ name: Create Release Branch
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 7 * * 2'
|
||||
- cron: '0 7 * * 5'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
|
||||
674
Cargo.lock
generated
674
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
19
Cargo.toml
19
Cargo.toml
@@ -36,6 +36,7 @@ license = "Apache-2.0"
|
||||
## All dependency versions, used in the project
|
||||
[workspace.dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||
azure_core = "0.16"
|
||||
azure_identity = "0.16"
|
||||
@@ -47,6 +48,7 @@ async-trait = "0.1"
|
||||
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "0.29"
|
||||
aws-smithy-http = "0.56"
|
||||
aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
|
||||
aws-credential-types = "0.56"
|
||||
aws-types = "0.56"
|
||||
axum = { version = "0.6.20", features = ["ws"] }
|
||||
@@ -65,7 +67,7 @@ comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-utils = "0.8.5"
|
||||
dashmap = "5.5.0"
|
||||
dashmap = { version = "5.5.0", features = ["raw-api"] }
|
||||
either = "1.8"
|
||||
enum-map = "2.4.2"
|
||||
enumset = "1.0.12"
|
||||
@@ -81,7 +83,7 @@ hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
hmac = "0.12.1"
|
||||
hostname = "0.3.1"
|
||||
http-types = "2"
|
||||
http-types = { version = "2", default-features = false }
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
@@ -124,6 +126,7 @@ sentry = { version = "0.31", default-features = false, features = ["backtrace",
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "2.0"
|
||||
serde_assert = "0.5.0"
|
||||
sha2 = "0.10.2"
|
||||
signal-hook = "0.3"
|
||||
smallvec = "1.11"
|
||||
@@ -161,11 +164,11 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
@@ -202,7 +205,7 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ RUN set -e \
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS build
|
||||
WORKDIR /home/nonroot
|
||||
ARG GIT_VERSION=local
|
||||
ARG BUILD_TAG
|
||||
|
||||
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
|
||||
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
|
||||
@@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||
|
||||
4
Makefile
4
Makefile
@@ -72,6 +72,10 @@ neon: postgres-headers walproposer-lib
|
||||
#
|
||||
$(POSTGRES_INSTALL_DIR)/build/%/config.status:
|
||||
+@echo "Configuring Postgres $* build"
|
||||
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
|
||||
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
|
||||
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
|
||||
exit 1; }
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
|
||||
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
|
||||
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
|
||||
|
||||
@@ -156,6 +156,7 @@ fn main() -> Result<()> {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
spec = Some(serde_json::from_reader(file)?);
|
||||
live_config_allowed = true;
|
||||
} else if let Some(id) = compute_id {
|
||||
if let Some(cp_base) = control_plane_uri {
|
||||
live_config_allowed = true;
|
||||
@@ -277,32 +278,26 @@ fn main() -> Result<()> {
|
||||
if #[cfg(target_os = "linux")] {
|
||||
use std::env;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
|
||||
let vm_monitor_addr = matches
|
||||
.get_one::<String>("vm-monitor-addr")
|
||||
.expect("--vm-monitor-addr should always be set because it has a default arg");
|
||||
let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
|
||||
let cgroup = matches.get_one::<String>("cgroup");
|
||||
let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
|
||||
|
||||
// Only make a runtime if we need to.
|
||||
// Note: it seems like you can make a runtime in an inner scope and
|
||||
// if you start a task in it it won't be dropped. However, make it
|
||||
// in the outermost scope just to be safe.
|
||||
let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
|
||||
(None, None) => None,
|
||||
(None, Some(_)) => {
|
||||
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
|
||||
None
|
||||
}
|
||||
(Some(_), None) => {
|
||||
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
|
||||
}
|
||||
(Some(_), Some(_)) => Some(
|
||||
let rt = if env::var_os("AUTOSCALING").is_some() {
|
||||
Some(
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.worker_threads(4)
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create tokio runtime for monitor"),
|
||||
),
|
||||
.expect("failed to create tokio runtime for monitor")
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// This token is used internally by the monitor to clean up all threads
|
||||
@@ -313,8 +308,7 @@ fn main() -> Result<()> {
|
||||
Box::leak(Box::new(vm_monitor::Args {
|
||||
cgroup: cgroup.cloned(),
|
||||
pgconnstr: file_cache_connstr.cloned(),
|
||||
addr: vm_monitor_addr.cloned().unwrap(),
|
||||
file_cache_on_disk,
|
||||
addr: vm_monitor_addr.clone(),
|
||||
})),
|
||||
token.clone(),
|
||||
))
|
||||
@@ -486,6 +480,8 @@ fn cli() -> clap::Command {
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
// DEPRECATED, NO LONGER DOES ANYTHING.
|
||||
// See https://github.com/neondatabase/cloud/issues/7516
|
||||
Arg::new("file-cache-on-disk")
|
||||
.long("file-cache-on-disk")
|
||||
.action(clap::ArgAction::SetTrue),
|
||||
|
||||
@@ -710,8 +710,12 @@ impl ComputeNode {
|
||||
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
|
||||
// have opened connection to Postgres and superuser access.
|
||||
#[instrument(skip_all)]
|
||||
fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
|
||||
client.simple_query("SELECT pg_reload_conf()")?;
|
||||
fn pg_reload_conf(&self) -> Result<()> {
|
||||
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
|
||||
Command::new(pgctl_bin)
|
||||
.args(["reload", "-D", &self.pgdata])
|
||||
.output()
|
||||
.expect("cannot run pg_ctl process");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -724,9 +728,9 @@ impl ComputeNode {
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||
self.pg_reload_conf(&mut client)?;
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
|
||||
@@ -78,7 +78,7 @@ use regex::Regex;
|
||||
use remote_storage::*;
|
||||
use serde_json;
|
||||
use std::io::Read;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::path::Path;
|
||||
use std::str;
|
||||
use tar::Archive;
|
||||
@@ -281,8 +281,6 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
|
||||
max_keys_per_list_response: None,
|
||||
};
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
|
||||
storage: RemoteStorageKind::AwsS3(config),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//!
|
||||
//! Various tools and helpers to handle cluster / compute node (Postgres)
|
||||
//! configuration.
|
||||
//!
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
pub mod checker;
|
||||
pub mod config;
|
||||
pub mod configurator;
|
||||
|
||||
@@ -193,11 +193,16 @@ impl Escaping for PgIdent {
|
||||
/// Build a list of existing Postgres roles
|
||||
pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
|
||||
let postgres_roles = xact
|
||||
.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
|
||||
.query(
|
||||
"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
|
||||
&[],
|
||||
)?
|
||||
.iter()
|
||||
.map(|row| Role {
|
||||
name: row.get("rolname"),
|
||||
encrypted_password: row.get("rolpassword"),
|
||||
replication: Some(row.get("rolreplication")),
|
||||
bypassrls: Some(row.get("rolbypassrls")),
|
||||
options: None,
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -24,7 +24,7 @@ fn do_control_plane_request(
|
||||
) -> Result<ControlPlaneSpecResponse, (bool, String)> {
|
||||
let resp = reqwest::blocking::Client::new()
|
||||
.get(uri)
|
||||
.header("Authorization", jwt)
|
||||
.header("Authorization", format!("Bearer {}", jwt))
|
||||
.send()
|
||||
.map_err(|e| {
|
||||
(
|
||||
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
|
||||
base_uri: &str,
|
||||
compute_id: &str,
|
||||
) -> Result<Option<ComputeSpec>> {
|
||||
let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
|
||||
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
|
||||
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
@@ -265,6 +265,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
let action = if let Some(r) = pg_role {
|
||||
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|
||||
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|
||||
|| !r.bypassrls.unwrap_or(false)
|
||||
|| !r.replication.unwrap_or(false)
|
||||
{
|
||||
RoleAction::Update
|
||||
} else if let Some(pg_pwd) = &r.encrypted_password {
|
||||
@@ -296,7 +298,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
match action {
|
||||
RoleAction::None => {}
|
||||
RoleAction::Update => {
|
||||
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
|
||||
let mut query: String =
|
||||
format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@ use crate::{background_process, local_env::LocalEnv};
|
||||
use anyhow::anyhow;
|
||||
use camino::Utf8PathBuf;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::{path::PathBuf, process::Child};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
@@ -14,12 +13,10 @@ pub struct AttachmentService {
|
||||
|
||||
const COMMAND: &str = "attachment_service";
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
pub pageserver_id: Option<NodeId>,
|
||||
pub node_id: Option<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -85,7 +82,7 @@ impl AttachmentService {
|
||||
.control_plane_api
|
||||
.clone()
|
||||
.unwrap()
|
||||
.join("attach_hook")
|
||||
.join("attach-hook")
|
||||
.unwrap();
|
||||
let client = reqwest::blocking::ClientBuilder::new()
|
||||
.build()
|
||||
@@ -93,7 +90,7 @@ impl AttachmentService {
|
||||
|
||||
let request = AttachHookRequest {
|
||||
tenant_id,
|
||||
pageserver_id: Some(pageserver_id),
|
||||
node_id: Some(pageserver_id),
|
||||
};
|
||||
|
||||
let response = client.post(url).json(&request).send()?;
|
||||
|
||||
@@ -262,7 +262,7 @@ where
|
||||
P: Into<Utf8PathBuf>,
|
||||
{
|
||||
let path: Utf8PathBuf = path.into();
|
||||
// SAFETY
|
||||
// SAFETY:
|
||||
// pre_exec is marked unsafe because it runs between fork and exec.
|
||||
// Why is that dangerous in various ways?
|
||||
// Long answer: https://github.com/rust-lang/rust/issues/39575
|
||||
|
||||
@@ -12,6 +12,7 @@ use hyper::{Body, Request, Response};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::logging::{self, LogFormat};
|
||||
use utils::signals::{ShutdownSignals, Signal};
|
||||
|
||||
@@ -171,7 +172,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
|
||||
state.generation += 1;
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: *t,
|
||||
generation: state.generation,
|
||||
gen: state.generation,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -217,27 +218,31 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
.tenants
|
||||
.entry(attach_req.tenant_id)
|
||||
.or_insert_with(|| TenantState {
|
||||
pageserver: attach_req.pageserver_id,
|
||||
pageserver: attach_req.node_id,
|
||||
generation: 0,
|
||||
});
|
||||
|
||||
if let Some(attaching_pageserver) = attach_req.pageserver_id.as_ref() {
|
||||
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
|
||||
tenant_state.generation += 1;
|
||||
tracing::info!(
|
||||
"attach_hook: issuing generation {} to pageserver {}",
|
||||
attaching_pageserver,
|
||||
tenant_state.generation
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
ps_id = %attaching_pageserver,
|
||||
generation = %tenant_state.generation,
|
||||
"issuing",
|
||||
);
|
||||
} else if let Some(ps_id) = tenant_state.pageserver {
|
||||
tracing::info!(
|
||||
"attach_hook: dropping pageserver {} in generation {}",
|
||||
ps_id,
|
||||
tenant_state.generation
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
%ps_id,
|
||||
generation = %tenant_state.generation,
|
||||
"dropping",
|
||||
);
|
||||
} else {
|
||||
tracing::info!("attach_hook: no-op: tenant already has no pageserver");
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
"no-op: tenant already has no pageserver");
|
||||
}
|
||||
tenant_state.pageserver = attach_req.pageserver_id;
|
||||
tenant_state.pageserver = attach_req.node_id;
|
||||
let generation = tenant_state.generation;
|
||||
|
||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||
@@ -245,7 +250,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
AttachHookResponse {
|
||||
gen: attach_req.pageserver_id.map(|_| generation),
|
||||
gen: attach_req.node_id.map(|_| generation),
|
||||
},
|
||||
)
|
||||
}
|
||||
@@ -253,9 +258,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
endpoint::make_router()
|
||||
.data(Arc::new(State::new(persistent_state)))
|
||||
.post("/re-attach", handle_re_attach)
|
||||
.post("/validate", handle_validate)
|
||||
.post("/attach_hook", handle_attach_hook)
|
||||
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
||||
.post("/validate", |r| request_span(r, handle_validate))
|
||||
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
|
||||
@@ -798,6 +798,24 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
ep.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
}
|
||||
}
|
||||
"reconfigure" => {
|
||||
let endpoint_id = sub_args
|
||||
.get_one::<String>("endpoint_id")
|
||||
.ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let pageserver_id =
|
||||
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
|
||||
Some(NodeId(
|
||||
id_str.parse().context("while parsing pageserver id")?,
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
endpoint.reconfigure(pageserver_id)?;
|
||||
}
|
||||
"stop" => {
|
||||
let endpoint_id = sub_args
|
||||
.get_one::<String>("endpoint_id")
|
||||
@@ -1369,6 +1387,12 @@ fn cli() -> Command {
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
)
|
||||
.subcommand(Command::new("reconfigure")
|
||||
.about("Reconfigure the endpoint")
|
||||
.arg(endpoint_pageserver_id_arg)
|
||||
.arg(endpoint_id_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
.arg(endpoint_id_arg)
|
||||
|
||||
@@ -46,7 +46,6 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
@@ -57,13 +56,10 @@ use compute_api::responses::{ComputeState, ComputeStatus};
|
||||
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
|
||||
|
||||
// contents of a endpoint.json file
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct EndpointConf {
|
||||
endpoint_id: String,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
timeline_id: TimelineId,
|
||||
mode: ComputeMode,
|
||||
pg_port: u16,
|
||||
@@ -414,18 +410,34 @@ impl Endpoint {
|
||||
);
|
||||
}
|
||||
|
||||
// Also wait for the compute_ctl process to die. It might have some cleanup
|
||||
// work to do after postgres stops, like syncing safekeepers, etc.
|
||||
//
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
|
||||
// TODO use background_process::stop_process instead
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
|
||||
let pid = nix::unistd::Pid::from_raw(pid as i32);
|
||||
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_postgresql_conf(&self) -> Result<String> {
|
||||
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
|
||||
// memory. We will include it in the spec file that we pass to
|
||||
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
|
||||
// in the data directory.
|
||||
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
|
||||
match std::fs::read(&postgresql_conf_path) {
|
||||
Ok(content) => Ok(String::from_utf8(content)?),
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
|
||||
Err(e) => Err(anyhow::Error::new(e).context(format!(
|
||||
"failed to read config file in {}",
|
||||
postgresql_conf_path.to_str().unwrap()
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
@@ -436,21 +448,7 @@ impl Endpoint {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
}
|
||||
|
||||
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
|
||||
// memory. We will include it in the spec file that we pass to
|
||||
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
|
||||
// in the data directory.
|
||||
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
|
||||
let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
|
||||
Ok(content) => String::from_utf8(content)?,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
|
||||
Err(e) => {
|
||||
return Err(anyhow::Error::new(e).context(format!(
|
||||
"failed to read config file in {}",
|
||||
postgresql_conf_path.to_str().unwrap()
|
||||
)))
|
||||
}
|
||||
};
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
|
||||
// We always start the compute node from scratch, so if the Postgres
|
||||
// data dir exists from a previous launch, remove it first.
|
||||
@@ -621,6 +619,61 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
let file = std::fs::File::open(spec_path)?;
|
||||
serde_json::from_reader(file)?
|
||||
};
|
||||
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
if let Some(pageserver_id) = pageserver_id {
|
||||
let endpoint_config_path = self.endpoint_path().join("endpoint.json");
|
||||
let mut endpoint_conf: EndpointConf = {
|
||||
let file = std::fs::File::open(&endpoint_config_path)?;
|
||||
serde_json::from_reader(file)?
|
||||
};
|
||||
endpoint_conf.pageserver_id = pageserver_id;
|
||||
std::fs::write(
|
||||
endpoint_config_path,
|
||||
serde_json::to_string_pretty(&endpoint_conf)?,
|
||||
)?;
|
||||
|
||||
let pageserver =
|
||||
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
|
||||
let ps_http_conf = &pageserver.pg_connection_config;
|
||||
let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
|
||||
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
|
||||
}
|
||||
|
||||
let client = reqwest::blocking::Client::new();
|
||||
let response = client
|
||||
.post(format!(
|
||||
"http://{}:{}/configure",
|
||||
self.http_address.ip(),
|
||||
self.http_address.port()
|
||||
))
|
||||
.body(format!(
|
||||
"{{\"spec\":{}}}",
|
||||
serde_json::to_string_pretty(&spec)?
|
||||
))
|
||||
.send()?;
|
||||
|
||||
let status = response.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
Ok(())
|
||||
} else {
|
||||
let url = response.url().to_owned();
|
||||
let msg = match response.text() {
|
||||
Ok(err_body) => format!("Error: {}", err_body),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
};
|
||||
Err(anyhow::anyhow!(msg))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self, destroy: bool) -> Result<()> {
|
||||
// If we are going to destroy data directory,
|
||||
// use immediate shutdown mode, otherwise,
|
||||
@@ -629,15 +682,25 @@ impl Endpoint {
|
||||
// Postgres is always started from scratch, so stop
|
||||
// without destroy only used for testing and debugging.
|
||||
//
|
||||
self.pg_ctl(
|
||||
if destroy {
|
||||
&["-m", "immediate", "stop"]
|
||||
} else {
|
||||
&["stop"]
|
||||
},
|
||||
&None,
|
||||
)?;
|
||||
|
||||
// Also wait for the compute_ctl process to die. It might have some cleanup
|
||||
// work to do after postgres stops, like syncing safekeepers, etc.
|
||||
//
|
||||
self.wait_for_compute_ctl_to_exit()?;
|
||||
if destroy {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
self.pgdata().to_str().unwrap()
|
||||
);
|
||||
std::fs::remove_dir_all(self.endpoint_path())?;
|
||||
} else {
|
||||
self.pg_ctl(&["stop"], &None)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
//
|
||||
// Local control plane.
|
||||
//
|
||||
// Can start, configure and stop postgres instances running as a local processes.
|
||||
//
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
//! Local control plane.
|
||||
//!
|
||||
//! Can start, configure and stop postgres instances running as a local processes.
|
||||
//!
|
||||
//! Intended to be used in integration tests and in CLI tools for
|
||||
//! local installations.
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
pub mod attachment_service;
|
||||
mod background_process;
|
||||
|
||||
@@ -8,7 +8,6 @@ use anyhow::{bail, ensure, Context};
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
@@ -33,7 +32,6 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
|
||||
// an example.
|
||||
//
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct LocalEnv {
|
||||
// Base directory for all the nodes (the pageserver, safekeepers and
|
||||
@@ -59,7 +57,6 @@ pub struct LocalEnv {
|
||||
// Default tenant ID to use with the 'neon_local' command line utility, when
|
||||
// --tenant_id is not explicitly specified.
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub default_tenant_id: Option<TenantId>,
|
||||
|
||||
// used to issue tokens during e.g pg start
|
||||
@@ -84,7 +81,6 @@ pub struct LocalEnv {
|
||||
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
|
||||
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
|
||||
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
|
||||
#[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
|
||||
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
}
|
||||
|
||||
|
||||
22
deny.toml
22
deny.toml
@@ -74,10 +74,30 @@ highlight = "all"
|
||||
workspace-default-features = "allow"
|
||||
external-default-features = "allow"
|
||||
allow = []
|
||||
deny = []
|
||||
|
||||
skip = []
|
||||
skip-tree = []
|
||||
|
||||
[[bans.deny]]
|
||||
# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol
|
||||
# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust
|
||||
name = "async-std"
|
||||
|
||||
[[bans.deny]]
|
||||
name = "async-io"
|
||||
|
||||
[[bans.deny]]
|
||||
name = "async-waker"
|
||||
|
||||
[[bans.deny]]
|
||||
name = "async-global-executor"
|
||||
|
||||
[[bans.deny]]
|
||||
name = "async-executor"
|
||||
|
||||
[[bans.deny]]
|
||||
name = "smol"
|
||||
|
||||
# This section is considered when running `cargo deny check sources`.
|
||||
# More documentation about the 'sources' section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
|
||||
|
||||
108
docs/updating-postgres.md
Normal file
108
docs/updating-postgres.md
Normal file
@@ -0,0 +1,108 @@
|
||||
# Updating Postgres
|
||||
|
||||
## Minor Versions
|
||||
|
||||
When upgrading to a new minor version of Postgres, please follow these steps:
|
||||
|
||||
_Example: 15.4 is the new minor version to upgrade to from 15.3._
|
||||
|
||||
1. Clone the Neon Postgres repository if you have not done so already.
|
||||
|
||||
```shell
|
||||
git clone git@github.com:neondatabase/postgres.git
|
||||
```
|
||||
|
||||
1. Add the Postgres upstream remote.
|
||||
|
||||
```shell
|
||||
git remote add upstream https://git.postgresql.org/git/postgresql.git
|
||||
```
|
||||
|
||||
1. Create a new branch based on the stable branch you are updating.
|
||||
|
||||
```shell
|
||||
git checkout -b my-branch REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
1. Tag the last commit on the stable branch you are updating.
|
||||
|
||||
```shell
|
||||
git tag REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Push the new tag to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
|
||||
|
||||
1. Rebase the branch you created on the tag and resolve any conflicts.
|
||||
|
||||
```shell
|
||||
git fetch upstream REL_15_4
|
||||
git rebase REL_15_4
|
||||
```
|
||||
|
||||
1. Run the Postgres test suite to make sure our commits have not affected
|
||||
Postgres in a negative way.
|
||||
|
||||
```shell
|
||||
make check
|
||||
# OR
|
||||
meson test -C builddir
|
||||
```
|
||||
|
||||
1. Push your branch to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin my-branch
|
||||
```
|
||||
|
||||
1. Clone the Neon repository if you have not done so already.
|
||||
|
||||
```shell
|
||||
git clone git@github.com:neondatabase/neon.git
|
||||
```
|
||||
|
||||
1. Create a new branch.
|
||||
|
||||
1. Change the `revisions.json` file to point at the HEAD of your Postgres
|
||||
branch.
|
||||
|
||||
1. Update the Git submodule.
|
||||
|
||||
```shell
|
||||
git submodule set-branch --branch my-branch vendor/postgres-v15
|
||||
git submodule update --remote vendor/postgres-v15
|
||||
```
|
||||
|
||||
1. Run the Neon test suite to make sure that Neon is still good to go on this
|
||||
minor Postgres release.
|
||||
|
||||
```shell
|
||||
./scripts/poetry -k pg15
|
||||
```
|
||||
|
||||
1. Commit your changes.
|
||||
|
||||
1. Create a pull request, and wait for CI to go green.
|
||||
|
||||
1. Force push the rebased Postgres branches into the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push --force origin my-branch:REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
It may require disabling various branch protections.
|
||||
|
||||
1. Update your Neon PR to point at the branches.
|
||||
|
||||
```shell
|
||||
git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
|
||||
git commit --amend --no-edit
|
||||
git push --force origin
|
||||
```
|
||||
|
||||
1. Merge the pull request after getting approval(s) and CI completion.
|
||||
@@ -1,3 +1,5 @@
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
pub mod requests;
|
||||
pub mod responses;
|
||||
pub mod spec;
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -19,7 +18,6 @@ pub type PgIdent = String;
|
||||
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[serde_as]
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct ComputeSpec {
|
||||
pub format_version: f32,
|
||||
@@ -50,12 +48,12 @@ pub struct ComputeSpec {
|
||||
// these, and instead set the "neon.tenant_id", "neon.timeline_id",
|
||||
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
|
||||
// updated to fill these fields, we can make these non optional.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub tenant_id: Option<TenantId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
|
||||
pub pageserver_connstring: Option<String>,
|
||||
|
||||
#[serde(default)]
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
|
||||
@@ -140,14 +138,13 @@ impl RemoteExtSpec {
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeMode {
|
||||
/// A read-write node
|
||||
#[default]
|
||||
Primary,
|
||||
/// A read-only node, pinned at a particular LSN
|
||||
Static(#[serde_as(as = "DisplayFromStr")] Lsn),
|
||||
Static(Lsn),
|
||||
/// A read-only node that follows the tip of the branch in hot standby mode
|
||||
///
|
||||
/// Future versions may want to distinguish between replicas with hot standby
|
||||
@@ -190,6 +187,8 @@ pub struct DeltaOp {
|
||||
pub struct Role {
|
||||
pub name: PgIdent,
|
||||
pub encrypted_password: Option<String>,
|
||||
pub replication: Option<bool>,
|
||||
pub bypassrls: Option<bool>,
|
||||
pub options: GenericOptions,
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//!
|
||||
//! Shared code for consumption metics collection
|
||||
//!
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use chrono::{DateTime, Utc};
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
//! make sure that we use the same dep version everywhere.
|
||||
//! Otherwise, we might not see all metrics registered via
|
||||
//! a default registry.
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use once_cell::sync::Lazy;
|
||||
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
|
||||
pub use prometheus::opts;
|
||||
@@ -89,14 +90,14 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||
];
|
||||
|
||||
pub fn set_build_info_metric(revision: &str) {
|
||||
pub fn set_build_info_metric(revision: &str, build_tag: &str) {
|
||||
let metric = register_int_gauge_vec!(
|
||||
"libmetrics_build_info",
|
||||
"Build/version information",
|
||||
&["revision"]
|
||||
&["revision", "build_tag"]
|
||||
)
|
||||
.expect("Failed to register build info metric");
|
||||
metric.with_label_values(&[revision]).set(1);
|
||||
metric.with_label_values(&[revision, build_tag]).set(1);
|
||||
}
|
||||
|
||||
// Records I/O stats in a "cross-platform" way.
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
//! See docs/rfcs/025-generation-numbers.md
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -12,12 +11,10 @@ pub struct ReAttachRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponseTenant {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub generation: u32,
|
||||
pub gen: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -25,10 +22,8 @@ pub struct ReAttachResponse {
|
||||
pub tenants: Vec<ReAttachResponseTenant>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateRequestTenant {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub gen: u32,
|
||||
}
|
||||
@@ -43,10 +38,8 @@ pub struct ValidateResponse {
|
||||
pub tenants: Vec<ValidateResponseTenant>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateResponseTenant {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub valid: bool,
|
||||
}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use const_format::formatcp;
|
||||
|
||||
/// Public API types
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::{
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use serde_with::serde_as;
|
||||
use strum_macros;
|
||||
use utils::{
|
||||
completion,
|
||||
@@ -110,7 +110,6 @@ impl TenantState {
|
||||
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
|
||||
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
|
||||
// tenant mgr startup distinguishes attaching from loading via marker file.
|
||||
// If it's loading, there is no attach marker file, i.e., attach had finished in the past.
|
||||
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
|
||||
// We only reach Active after successful load / attach.
|
||||
// So, call atttachment status Attached.
|
||||
@@ -175,25 +174,19 @@ pub enum TimelineState {
|
||||
Broken { reason: String, backtrace: String },
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineCreateRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub new_timeline_id: TimelineId,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_start_lsn: Option<Lsn>,
|
||||
pub pg_version: Option<u32>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantCreateRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub new_tenant_id: TenantId,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
@@ -202,7 +195,6 @@ pub struct TenantCreateRequest {
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLoadRequest {
|
||||
@@ -279,31 +271,26 @@ pub struct LocationConfig {
|
||||
pub tenant_conf: TenantConfig,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);
|
||||
pub struct TenantCreateResponse(pub TenantId);
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct StatusResponse {
|
||||
pub id: NodeId,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(flatten)]
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantConfigRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
@@ -375,10 +362,8 @@ pub enum TenantAttachmentStatus {
|
||||
Failed { reason: String },
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TenantInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
// NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
|
||||
pub state: TenantState,
|
||||
@@ -389,33 +374,22 @@ pub struct TenantInfo {
|
||||
}
|
||||
|
||||
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub timeline_id: TimelineId,
|
||||
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub last_record_lsn: Lsn,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub prev_record_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub latest_gc_cutoff_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
|
||||
/// The LSN that we have succesfully uploaded to remote storage
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
|
||||
/// The LSN that we are advertizing to safekeepers
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn_visible: Lsn,
|
||||
|
||||
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
@@ -427,7 +401,6 @@ pub struct TimelineInfo {
|
||||
pub timeline_dir_layer_file_size_sum: Option<u64>,
|
||||
|
||||
pub wal_source_connstr: Option<String>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub last_received_msg_lsn: Option<Lsn>,
|
||||
/// the timestamp (in microseconds) of the last received message
|
||||
pub last_received_msg_ts: Option<u128>,
|
||||
@@ -524,23 +497,13 @@ pub struct LayerAccessStats {
|
||||
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum InMemoryLayerInfo {
|
||||
Open {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
},
|
||||
Frozen {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_end: Lsn,
|
||||
},
|
||||
Open { lsn_start: Lsn },
|
||||
Frozen { lsn_start: Lsn, lsn_end: Lsn },
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum HistoricLayerInfo {
|
||||
@@ -548,9 +511,7 @@ pub enum HistoricLayerInfo {
|
||||
layer_file_name: String,
|
||||
layer_file_size: u64,
|
||||
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_end: Lsn,
|
||||
remote: bool,
|
||||
access_stats: LayerAccessStats,
|
||||
@@ -559,7 +520,6 @@ pub enum HistoricLayerInfo {
|
||||
layer_file_name: String,
|
||||
layer_file_size: u64,
|
||||
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
remote: bool,
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
//! To use, create PostgresBackend and run() it, passing the Handler
|
||||
//! implementation determining how to process the queries. Currently its API
|
||||
//! is rather narrow, but we can extend it once required.
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use futures::pin_mut;
|
||||
@@ -15,7 +17,7 @@ use std::{fmt, io};
|
||||
use std::{future::Future, str::FromStr};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
use tracing::{debug, error, info, trace};
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
|
||||
use pq_proto::{
|
||||
@@ -33,6 +35,11 @@ pub enum QueryError {
|
||||
/// We were instructed to shutdown while processing the query
|
||||
#[error("Shutting down")]
|
||||
Shutdown,
|
||||
/// Authentication failure
|
||||
#[error("Unauthorized: {0}")]
|
||||
Unauthorized(std::borrow::Cow<'static, str>),
|
||||
#[error("Simulated Connection Error")]
|
||||
SimulatedConnectionError,
|
||||
/// Some other error
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
@@ -47,8 +54,9 @@ impl From<io::Error> for QueryError {
|
||||
impl QueryError {
|
||||
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
||||
match self {
|
||||
Self::Disconnected(_) => b"08006", // connection failure
|
||||
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
|
||||
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
|
||||
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
|
||||
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
||||
}
|
||||
}
|
||||
@@ -242,6 +250,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Cancellation safe as long as the underlying IO is cancellation safe.
|
||||
async fn shutdown(&mut self) -> io::Result<()> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.shutdown().await,
|
||||
@@ -393,13 +402,23 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
shutdown_watcher: F,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
F: Fn() -> S + Clone,
|
||||
S: Future,
|
||||
{
|
||||
let ret = self.run_message_loop(handler, shutdown_watcher).await;
|
||||
// socket might be already closed, e.g. if previously received error,
|
||||
// so ignore result.
|
||||
self.framed.shutdown().await.ok();
|
||||
let ret = self
|
||||
.run_message_loop(handler, shutdown_watcher.clone())
|
||||
.await;
|
||||
|
||||
tokio::select! {
|
||||
_ = shutdown_watcher() => {
|
||||
// do nothing; we most likely got already stopped by shutdown and will log it next.
|
||||
}
|
||||
_ = self.framed.shutdown() => {
|
||||
// socket might be already closed, e.g. if previously received error,
|
||||
// so ignore result.
|
||||
},
|
||||
}
|
||||
|
||||
match ret {
|
||||
Ok(()) => Ok(()),
|
||||
Err(QueryError::Shutdown) => {
|
||||
@@ -597,7 +616,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
|
||||
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
&short_error(&e),
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
return Err(e);
|
||||
@@ -717,12 +736,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
|
||||
trace!("got query {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
log_query_error(query_string, &e);
|
||||
let short_error = short_error(&e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&short_error,
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
match e {
|
||||
QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
|
||||
QueryError::SimulatedConnectionError => {
|
||||
return Err(QueryError::SimulatedConnectionError)
|
||||
}
|
||||
e => {
|
||||
log_query_error(query_string, &e);
|
||||
let short_error = short_error(&e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&short_error,
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
self.write_message_noflush(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
@@ -948,6 +975,8 @@ pub fn short_error(e: &QueryError) -> String {
|
||||
match e {
|
||||
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
||||
QueryError::Shutdown => "shutdown".to_string(),
|
||||
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
|
||||
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
|
||||
QueryError::Other(e) => format!("{e:#}"),
|
||||
}
|
||||
}
|
||||
@@ -964,9 +993,15 @@ fn log_query_error(query: &str, e: &QueryError) {
|
||||
QueryError::Disconnected(other_connection_error) => {
|
||||
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
|
||||
}
|
||||
QueryError::SimulatedConnectionError => {
|
||||
error!("query handler for query '{query}' failed due to a simulated connection error")
|
||||
}
|
||||
QueryError::Shutdown => {
|
||||
info!("query handler for '{query}' cancelled during tenant shutdown")
|
||||
}
|
||||
QueryError::Unauthorized(e) => {
|
||||
warn!("query handler for '{query}' failed with authentication error: {e}");
|
||||
}
|
||||
QueryError::Other(e) => {
|
||||
error!("query handler for '{query}' failed: {e:?}");
|
||||
}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use anyhow::{bail, Context};
|
||||
use itertools::Itertools;
|
||||
use std::borrow::Cow;
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
// modules included with the postgres_ffi macro depend on the types of the specific version's
|
||||
// types, and trigger a too eager lint.
|
||||
#![allow(clippy::duplicate_mod)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
use bytes::Bytes;
|
||||
use utils::bin_ser::SerializeError;
|
||||
@@ -20,6 +21,7 @@ macro_rules! postgres_ffi {
|
||||
pub mod bindings {
|
||||
// bindgen generates bindings for a lot of stuff we don't need
|
||||
#![allow(dead_code)]
|
||||
#![allow(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
include!(concat!(
|
||||
|
||||
@@ -14,6 +14,7 @@ macro_rules! xlog_utils_test {
|
||||
($version:ident) => {
|
||||
#[path = "."]
|
||||
mod $version {
|
||||
#[allow(unused_imports)]
|
||||
pub use postgres_ffi::$version::wal_craft_test_export::*;
|
||||
#[allow(clippy::duplicate_mod)]
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -214,27 +214,24 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Cancellation safe as long as the AsyncWrite is cancellation safe.
|
||||
async fn flush<S: AsyncWrite + Unpin>(
|
||||
stream: &mut S,
|
||||
write_buf: &mut BytesMut,
|
||||
) -> Result<(), io::Error> {
|
||||
while write_buf.has_remaining() {
|
||||
let bytes_written = stream.write(write_buf.chunk()).await?;
|
||||
let bytes_written = stream.write_buf(write_buf).await?;
|
||||
if bytes_written == 0 {
|
||||
return Err(io::Error::new(
|
||||
ErrorKind::WriteZero,
|
||||
"failed to write message",
|
||||
));
|
||||
}
|
||||
// The advanced part will be garbage collected, likely during shifting
|
||||
// data left on next attempt to write to buffer when free space is not
|
||||
// enough.
|
||||
write_buf.advance(bytes_written);
|
||||
}
|
||||
write_buf.clear();
|
||||
stream.flush().await
|
||||
}
|
||||
|
||||
/// Cancellation safe as long as the AsyncWrite is cancellation safe.
|
||||
async fn shutdown<S: AsyncWrite + Unpin>(
|
||||
stream: &mut S,
|
||||
write_buf: &mut BytesMut,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! Postgres protocol messages serialization-deserialization. See
|
||||
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
|
||||
//! on message formats.
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
pub mod framed;
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ license.workspace = true
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
once_cell.workspace = true
|
||||
aws-smithy-async.workspace = true
|
||||
aws-smithy-http.workspace = true
|
||||
aws-types.workspace = true
|
||||
aws-config.workspace = true
|
||||
|
||||
@@ -1,21 +1,18 @@
|
||||
//! Azure Blob Storage wrapper
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::num::NonZeroU32;
|
||||
use std::sync::Arc;
|
||||
use std::{borrow::Cow, collections::HashMap, io::Cursor};
|
||||
use std::{borrow::Cow, io::Cursor};
|
||||
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Result;
|
||||
use azure_core::request_options::{MaxResults, Metadata, Range};
|
||||
use azure_core::Header;
|
||||
use azure_identity::DefaultAzureCredential;
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
use azure_storage_blobs::{
|
||||
blob::operations::GetBlobBuilder,
|
||||
prelude::{BlobClient, ContainerClient},
|
||||
};
|
||||
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
||||
use futures_util::StreamExt;
|
||||
use http_types::StatusCode;
|
||||
use tokio::io::AsyncRead;
|
||||
@@ -23,8 +20,8 @@ use tracing::debug;
|
||||
|
||||
use crate::s3_bucket::RequestKind;
|
||||
use crate::{
|
||||
AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
|
||||
StorageMetadata,
|
||||
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
|
||||
RemoteStorage, StorageMetadata,
|
||||
};
|
||||
|
||||
pub struct AzureBlobStorage {
|
||||
@@ -112,16 +109,19 @@ impl AzureBlobStorage {
|
||||
|
||||
async fn download_for_builder(
|
||||
&self,
|
||||
metadata: StorageMetadata,
|
||||
builder: GetBlobBuilder,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let mut response = builder.into_stream();
|
||||
|
||||
let mut metadata = HashMap::new();
|
||||
// TODO give proper streaming response instead of buffering into RAM
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
let mut buf = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part.map_err(to_download_error)?;
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
let data = part
|
||||
.data
|
||||
.collect()
|
||||
@@ -131,28 +131,9 @@ impl AzureBlobStorage {
|
||||
}
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(Cursor::new(buf)),
|
||||
metadata: Some(metadata),
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
}
|
||||
// TODO get rid of this function once we have metadata included in the response
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1439
|
||||
async fn get_metadata(
|
||||
&self,
|
||||
blob_client: &BlobClient,
|
||||
) -> Result<StorageMetadata, DownloadError> {
|
||||
let builder = blob_client.get_metadata();
|
||||
|
||||
let response = builder.into_future().await.map_err(to_download_error)?;
|
||||
let mut map = HashMap::new();
|
||||
|
||||
for md in response.metadata.iter() {
|
||||
map.insert(
|
||||
md.name().as_str().to_string(),
|
||||
md.value().as_str().to_string(),
|
||||
);
|
||||
}
|
||||
Ok(StorageMetadata(map))
|
||||
}
|
||||
|
||||
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
||||
self.concurrency_limiter
|
||||
@@ -184,10 +165,11 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for AzureBlobStorage {
|
||||
async fn list_prefixes(
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
mode: ListingMode,
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
@@ -195,16 +177,19 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let mut builder = self
|
||||
.client
|
||||
.list_blobs()
|
||||
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
let mut builder = self.client.list_blobs();
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
|
||||
if let Some(prefix) = list_prefix {
|
||||
builder = builder.prefix(Cow::from(prefix.to_owned()));
|
||||
@@ -215,46 +200,23 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
|
||||
let mut response = builder.into_stream();
|
||||
let mut res = Vec::new();
|
||||
while let Some(entry) = response.next().await {
|
||||
let entry = entry.map_err(to_download_error)?;
|
||||
let name_iter = entry
|
||||
let mut res = Listing::default();
|
||||
while let Some(l) = response.next().await {
|
||||
let entry = l.map_err(to_download_error)?;
|
||||
let prefix_iter = entry
|
||||
.blobs
|
||||
.prefixes()
|
||||
.map(|prefix| self.name_to_relative_path(&prefix.name));
|
||||
res.extend(name_iter);
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
res.prefixes.extend(prefix_iter);
|
||||
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let folder_name = folder
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
.or_else(|| self.prefix_in_container.clone());
|
||||
|
||||
let mut builder = self.client.list_blobs();
|
||||
|
||||
if let Some(folder_name) = folder_name {
|
||||
builder = builder.prefix(Cow::from(folder_name.to_owned()));
|
||||
}
|
||||
|
||||
if let Some(limit) = self.max_keys_per_list_response {
|
||||
builder = builder.max_results(MaxResults::new(limit));
|
||||
}
|
||||
|
||||
let mut response = builder.into_stream();
|
||||
let mut res = Vec::new();
|
||||
while let Some(l) = response.next().await {
|
||||
let entry = l.map_err(anyhow::Error::new)?;
|
||||
let name_iter = entry
|
||||
let blob_iter = entry
|
||||
.blobs
|
||||
.blobs()
|
||||
.map(|bl| self.name_to_relative_path(&bl.name));
|
||||
res.extend(name_iter);
|
||||
.map(|k| self.name_to_relative_path(&k.name));
|
||||
res.keys.extend(blob_iter);
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
|
||||
@@ -288,11 +250,9 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
let _permit = self.permit(RequestKind::Get).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
||||
|
||||
let metadata = self.get_metadata(&blob_client).await?;
|
||||
|
||||
let builder = blob_client.get();
|
||||
|
||||
self.download_for_builder(metadata, builder).await
|
||||
self.download_for_builder(builder).await
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
@@ -304,8 +264,6 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
let _permit = self.permit(RequestKind::Get).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
||||
|
||||
let metadata = self.get_metadata(&blob_client).await?;
|
||||
|
||||
let mut builder = blob_client.get();
|
||||
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
@@ -320,7 +278,7 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
builder = builder.range(Range::new(start_inclusive, end_exclusive));
|
||||
}
|
||||
|
||||
self.download_for_builder(metadata, builder).await
|
||||
self.download_for_builder(builder).await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
|
||||
@@ -6,19 +6,15 @@
|
||||
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage
|
||||
//! * [`azure_blob`] allows to use Azure Blob storage as an external storage
|
||||
//!
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
mod azure_blob;
|
||||
mod local_fs;
|
||||
mod s3_bucket;
|
||||
mod simulate_failures;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fmt::Debug,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
};
|
||||
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
@@ -34,12 +30,6 @@ pub use self::{
|
||||
};
|
||||
use s3_bucket::RequestKind;
|
||||
|
||||
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
|
||||
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
|
||||
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
|
||||
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||
@@ -129,6 +119,22 @@ impl RemotePath {
|
||||
}
|
||||
}
|
||||
|
||||
/// We don't need callers to be able to pass arbitrary delimiters: just control
|
||||
/// whether listings will use a '/' separator or not.
|
||||
///
|
||||
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
|
||||
/// NoDelimiter mode will only populate `keys`.
|
||||
pub enum ListingMode {
|
||||
WithDelimiter,
|
||||
NoDelimiter,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Listing {
|
||||
pub prefixes: Vec<RemotePath>,
|
||||
pub keys: Vec<RemotePath>,
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
@@ -141,8 +147,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError>;
|
||||
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::WithDelimiter)
|
||||
.await?
|
||||
.prefixes;
|
||||
Ok(result)
|
||||
}
|
||||
/// Lists all files in directory "recursively"
|
||||
/// (not really recursively, because AWS has a flat namespace)
|
||||
/// Note: This is subtely different than list_prefixes,
|
||||
@@ -154,7 +165,16 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// whereas,
|
||||
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
||||
/// See `test_real_s3.rs` for more details.
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
|
||||
async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
_mode: ListingMode,
|
||||
) -> anyhow::Result<Listing, DownloadError>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
@@ -205,6 +225,9 @@ pub enum DownloadError {
|
||||
BadInput(anyhow::Error),
|
||||
/// The file was not found in the remote storage.
|
||||
NotFound,
|
||||
/// A cancellation token aborted the download, typically during
|
||||
/// tenant detach or process shutdown.
|
||||
Cancelled,
|
||||
/// The file was found in the remote storage, but the download failed.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
@@ -215,6 +238,7 @@ impl std::fmt::Display for DownloadError {
|
||||
DownloadError::BadInput(e) => {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
|
||||
}
|
||||
@@ -234,6 +258,19 @@ pub enum GenericRemoteStorage {
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list(prefix, mode).await,
|
||||
Self::AwsS3(s) => s.list(prefix, mode).await,
|
||||
Self::AzureBlob(s) => s.list(prefix, mode).await,
|
||||
Self::Unreliable(s) => s.list(prefix, mode).await,
|
||||
}
|
||||
}
|
||||
|
||||
// A function for listing all the files in a "directory"
|
||||
// Example:
|
||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||
@@ -394,10 +431,6 @@ pub struct StorageMetadata(HashMap<String, String>);
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between the API user and the remote storage.
|
||||
pub max_concurrent_syncs: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
}
|
||||
@@ -493,18 +526,6 @@ impl RemoteStorageConfig {
|
||||
|
||||
let use_azure = container_name.is_some() && container_region.is_some();
|
||||
|
||||
let max_concurrent_syncs = NonZeroUsize::new(
|
||||
parse_optional_integer("max_concurrent_syncs", toml)?
|
||||
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
|
||||
)
|
||||
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
|
||||
|
||||
let max_sync_errors = NonZeroU32::new(
|
||||
parse_optional_integer("max_sync_errors", toml)?
|
||||
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
|
||||
)
|
||||
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
|
||||
|
||||
let default_concurrency_limit = if use_azure {
|
||||
DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
|
||||
} else {
|
||||
@@ -586,11 +607,7 @@ impl RemoteStorageConfig {
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(RemoteStorageConfig {
|
||||
max_concurrent_syncs,
|
||||
max_sync_errors,
|
||||
storage,
|
||||
}))
|
||||
Ok(Some(RemoteStorageConfig { storage }))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ use tokio::{
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{Download, DownloadError, RemotePath};
|
||||
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
@@ -75,7 +75,7 @@ impl LocalFs {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
Ok(get_all_files(&self.storage_root, true)
|
||||
.await?
|
||||
.into_iter()
|
||||
@@ -89,52 +89,10 @@ impl LocalFs {
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
prefixes.push(
|
||||
prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
Ok(prefixes)
|
||||
}
|
||||
|
||||
// recursively lists all files in a directory,
|
||||
// mirroring the `list_files` for `s3_bucket`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let full_path = match folder {
|
||||
Some(folder) => folder.with_base(&self.storage_root),
|
||||
None => self.storage_root.clone(),
|
||||
@@ -186,6 +144,70 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
Ok(files)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let mut result = Listing::default();
|
||||
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
result.keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let stripped = prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
);
|
||||
|
||||
if prefix.is_dir() {
|
||||
result.prefixes.push(stripped);
|
||||
} else {
|
||||
result.keys.push(stripped);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
@@ -479,7 +501,7 @@ mod fs_tests {
|
||||
|
||||
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
|
||||
assert_eq!(
|
||||
storage.list().await?,
|
||||
storage.list_all().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
@@ -667,7 +689,7 @@ mod fs_tests {
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list().await?.is_empty());
|
||||
assert!(storage.list_all().await?.is_empty());
|
||||
|
||||
storage
|
||||
.delete(&upload_target)
|
||||
@@ -725,6 +747,43 @@ mod fs_tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn list() -> anyhow::Result<()> {
|
||||
// No delimiter: should recursively list everything
|
||||
let storage = create_storage()?;
|
||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
|
||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
|
||||
|
||||
let listing = storage.list(None, ListingMode::NoDelimiter).await?;
|
||||
assert!(listing.prefixes.is_empty());
|
||||
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
||||
|
||||
// Delimiter: should only go one deep
|
||||
let listing = storage.list(None, ListingMode::WithDelimiter).await?;
|
||||
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("timelines").unwrap()].to_vec()
|
||||
);
|
||||
assert!(listing.keys.is_empty());
|
||||
|
||||
// Delimiter & prefix
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
||||
ListingMode::WithDelimiter,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
|
||||
.to_vec()
|
||||
);
|
||||
assert_eq!(listing.keys, [uncle.clone()].to_vec());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
@@ -777,7 +836,7 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let mut files = storage.list().await?;
|
||||
let mut files = storage.list_all().await?;
|
||||
files.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
@@ -4,23 +4,27 @@
|
||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||
//! their bucket prefixes are both specified and different.
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::{borrow::Cow, sync::Arc};
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
|
||||
provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
meta::credentials::CredentialsProviderChain,
|
||||
provider_config::ProviderConfig,
|
||||
retry::{RetryConfigBuilder, RetryMode},
|
||||
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||
};
|
||||
use aws_credential_types::cache::CredentialsCache;
|
||||
use aws_sdk_s3::{
|
||||
config::{Config, Region},
|
||||
config::{AsyncSleep, Config, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
operation::get_object::GetObjectError,
|
||||
primitives::ByteStream,
|
||||
types::{Delete, ObjectIdentifier},
|
||||
Client,
|
||||
};
|
||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
@@ -30,8 +34,8 @@ use tracing::debug;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
|
||||
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
pub(super) mod metrics;
|
||||
@@ -83,10 +87,23 @@ impl S3Bucket {
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
|
||||
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
|
||||
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
|
||||
|
||||
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
|
||||
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
|
||||
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
|
||||
let mut retry_config = RetryConfigBuilder::new();
|
||||
retry_config
|
||||
.set_max_attempts(Some(1))
|
||||
.set_mode(Some(RetryMode::Adaptive));
|
||||
|
||||
let mut config_builder = Config::builder()
|
||||
.region(region)
|
||||
.credentials_cache(CredentialsCache::lazy())
|
||||
.credentials_provider(credentials_provider);
|
||||
.credentials_provider(credentials_provider)
|
||||
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
|
||||
.retry_config(retry_config.build());
|
||||
|
||||
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
||||
config_builder = config_builder
|
||||
@@ -299,13 +316,13 @@ impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
/// See the doc for `RemoteStorage::list_prefixes`
|
||||
/// Note: it wont include empty "directories"
|
||||
async fn list_prefixes(
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
mode: ListingMode,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let kind = RequestKind::List;
|
||||
let mut result = Listing::default();
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
@@ -314,28 +331,33 @@ impl RemoteStorage for S3Bucket {
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let fetch_response = self
|
||||
let mut request = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(list_prefix.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.set_max_keys(self.max_keys_per_list_response);
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
|
||||
let response = request
|
||||
.send()
|
||||
.await
|
||||
.context("Failed to list S3 prefixes")
|
||||
@@ -345,71 +367,35 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &fetch_response, started_at);
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
let fetch_response = fetch_response?;
|
||||
let response = response?;
|
||||
|
||||
document_keys.extend(
|
||||
fetch_response
|
||||
.common_prefixes
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
let keys = response.contents().unwrap_or_default();
|
||||
let empty = Vec::new();
|
||||
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
|
||||
|
||||
tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
|
||||
|
||||
for object in keys {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||
result.keys.push(remote_path);
|
||||
}
|
||||
|
||||
result.prefixes.extend(
|
||||
prefixes
|
||||
.iter()
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
|
||||
continuation_token = match fetch_response.next_continuation_token {
|
||||
continuation_token = match response.next_continuation_token {
|
||||
Some(new_token) => Some(new_token),
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
|
||||
Ok(document_keys)
|
||||
}
|
||||
|
||||
/// See the doc for `RemoteStorage::list_files`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let kind = RequestKind::List;
|
||||
|
||||
let folder_name = folder
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
// AWS may need to break the response into several parts
|
||||
let mut continuation_token = None;
|
||||
let mut all_files = vec![];
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(folder_name.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.context("Failed to list files in S3 bucket");
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
let response = response?;
|
||||
|
||||
for object in response.contents().unwrap_or_default() {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||
all_files.push(remote_path);
|
||||
}
|
||||
match response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
Ok(all_files)
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
|
||||
@@ -5,7 +5,9 @@ use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
};
|
||||
|
||||
pub struct UnreliableWrapper {
|
||||
inner: crate::GenericRemoteStorage,
|
||||
@@ -95,6 +97,15 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.inner.list_files(folder).await
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
|
||||
self.inner.list(prefix, mode).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
@@ -469,8 +469,6 @@ fn create_azure_client(
|
||||
let random = rand::thread_rng().gen::<u32>();
|
||||
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(5).unwrap(),
|
||||
storage: RemoteStorageKind::AzureContainer(AzureConfig {
|
||||
container_name: remote_storage_azure_container,
|
||||
container_region: remote_storage_azure_region,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
@@ -396,8 +396,6 @@ fn create_s3_client(
|
||||
let random = rand::thread_rng().gen::<u32>();
|
||||
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(5).unwrap(),
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: remote_storage_s3_bucket,
|
||||
bucket_region: remote_storage_s3_region,
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use const_format::formatcp;
|
||||
|
||||
/// Public API types
|
||||
|
||||
@@ -1,23 +1,18 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineCreateRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub timeline_id: TimelineId,
|
||||
pub peer_ids: Option<Vec<NodeId>>,
|
||||
pub pg_version: u32,
|
||||
pub system_id: Option<u64>,
|
||||
pub wal_seg_size: Option<u32>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub commit_lsn: Lsn,
|
||||
// If not passed, it is assigned to the beginning of commit_lsn segment.
|
||||
pub local_start_lsn: Option<Lsn>,
|
||||
@@ -28,7 +23,6 @@ fn lsn_invalid() -> Lsn {
|
||||
}
|
||||
|
||||
/// Data about safekeeper's timeline, mirrors broker.proto.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct SkTimelineInfo {
|
||||
/// Term.
|
||||
@@ -36,25 +30,19 @@ pub struct SkTimelineInfo {
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Option<u64>,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub flush_lsn: Lsn,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub commit_lsn: Lsn,
|
||||
/// LSN up to which safekeeper has backed WAL.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub backup_lsn: Lsn,
|
||||
/// LSN of last checkpoint uploaded by pageserver.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub local_start_lsn: Lsn,
|
||||
/// A connection string to use for WAL receiving.
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
//! Synthetic size calculation
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
mod calculation;
|
||||
pub mod svg;
|
||||
|
||||
@@ -32,6 +32,8 @@
|
||||
//! .init();
|
||||
//! }
|
||||
//! ```
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
use opentelemetry::sdk::Resource;
|
||||
use opentelemetry::KeyValue;
|
||||
|
||||
@@ -5,6 +5,7 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
arc-swap.workspace = true
|
||||
sentry.workspace = true
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
@@ -55,6 +56,7 @@ bytes.workspace = true
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
serde_assert.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "benchmarks"
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
// For details about authentication see docs/authentication.md
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use serde;
|
||||
use std::fs;
|
||||
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
|
||||
|
||||
use anyhow::Result;
|
||||
use camino::Utf8Path;
|
||||
@@ -9,9 +10,8 @@ use jsonwebtoken::{
|
||||
decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
use crate::id::TenantId;
|
||||
use crate::{http::error::ApiError, id::TenantId};
|
||||
|
||||
/// Algorithm to use. We require EdDSA.
|
||||
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
|
||||
@@ -32,11 +32,9 @@ pub enum Scope {
|
||||
}
|
||||
|
||||
/// JWT payload. See docs/authentication.md for the format
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
|
||||
pub struct Claims {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub tenant_id: Option<TenantId>,
|
||||
pub scope: Scope,
|
||||
}
|
||||
@@ -47,31 +45,106 @@ impl Claims {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
|
||||
|
||||
impl SwappableJwtAuth {
|
||||
pub fn new(jwt_auth: JwtAuth) -> Self {
|
||||
SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
|
||||
}
|
||||
pub fn swap(&self, jwt_auth: JwtAuth) {
|
||||
self.0.swap(Arc::new(jwt_auth));
|
||||
}
|
||||
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
|
||||
self.0.load().decode(token)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for SwappableJwtAuth {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Swappable({:?})", self.0.load())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
|
||||
pub struct AuthError(pub Cow<'static, str>);
|
||||
|
||||
impl Display for AuthError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<AuthError> for ApiError {
|
||||
fn from(_value: AuthError) -> Self {
|
||||
// Don't pass on the value of the AuthError as a precautionary measure.
|
||||
// Being intentionally vague in public error communication hurts debugability
|
||||
// but it is more secure.
|
||||
ApiError::Forbidden("JWT authentication error".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct JwtAuth {
|
||||
decoding_key: DecodingKey,
|
||||
decoding_keys: Vec<DecodingKey>,
|
||||
validation: Validation,
|
||||
}
|
||||
|
||||
impl JwtAuth {
|
||||
pub fn new(decoding_key: DecodingKey) -> Self {
|
||||
pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
|
||||
let mut validation = Validation::default();
|
||||
validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
|
||||
// The default 'required_spec_claims' is 'exp'. But we don't want to require
|
||||
// expiration.
|
||||
validation.required_spec_claims = [].into();
|
||||
Self {
|
||||
decoding_key,
|
||||
decoding_keys,
|
||||
validation,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
|
||||
let public_key = fs::read(key_path)?;
|
||||
Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
|
||||
let metadata = key_path.metadata()?;
|
||||
let decoding_keys = if metadata.is_dir() {
|
||||
let mut keys = Vec::new();
|
||||
for entry in fs::read_dir(key_path)? {
|
||||
let path = entry?.path();
|
||||
if !path.is_file() {
|
||||
// Ignore directories (don't recurse)
|
||||
continue;
|
||||
}
|
||||
let public_key = fs::read(path)?;
|
||||
keys.push(DecodingKey::from_ed_pem(&public_key)?);
|
||||
}
|
||||
keys
|
||||
} else if metadata.is_file() {
|
||||
let public_key = fs::read(key_path)?;
|
||||
vec![DecodingKey::from_ed_pem(&public_key)?]
|
||||
} else {
|
||||
anyhow::bail!("path is neither a directory or a file")
|
||||
};
|
||||
if decoding_keys.is_empty() {
|
||||
anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
|
||||
}
|
||||
Ok(Self::new(decoding_keys))
|
||||
}
|
||||
|
||||
pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
|
||||
Ok(decode(token, &self.decoding_key, &self.validation)?)
|
||||
/// Attempt to decode the token with the internal decoding keys.
|
||||
///
|
||||
/// The function tries the stored decoding keys in succession,
|
||||
/// and returns the first yielding a successful result.
|
||||
/// If there is no working decoding key, it returns the last error.
|
||||
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
|
||||
let mut res = None;
|
||||
for decoding_key in &self.decoding_keys {
|
||||
res = Some(decode(token, decoding_key, &self.validation));
|
||||
if let Some(Ok(res)) = res {
|
||||
return Ok(res);
|
||||
}
|
||||
}
|
||||
if let Some(res) = res {
|
||||
res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
|
||||
} else {
|
||||
Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,9 +184,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
"#;
|
||||
|
||||
#[test]
|
||||
fn test_decode() -> Result<(), anyhow::Error> {
|
||||
fn test_decode() {
|
||||
let expected_claims = Claims {
|
||||
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
|
||||
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
|
||||
scope: Scope::Tenant,
|
||||
};
|
||||
|
||||
@@ -132,28 +205,24 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
|
||||
|
||||
// Check it can be validated with the public key
|
||||
let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
|
||||
let claims_from_token = auth.decode(encoded_eddsa)?.claims;
|
||||
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
|
||||
let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
|
||||
assert_eq!(claims_from_token, expected_claims);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode() -> Result<(), anyhow::Error> {
|
||||
fn test_encode() {
|
||||
let claims = Claims {
|
||||
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
|
||||
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
|
||||
scope: Scope::Tenant,
|
||||
};
|
||||
|
||||
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
|
||||
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();
|
||||
|
||||
// decode it back
|
||||
let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
|
||||
let decoded = auth.decode(&encoded)?;
|
||||
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
|
||||
let decoded = auth.decode(&encoded).unwrap();
|
||||
|
||||
assert_eq!(decoded.claims, claims);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
156
libs/utils/src/exp_counter.rs
Normal file
156
libs/utils/src/exp_counter.rs
Normal file
@@ -0,0 +1,156 @@
|
||||
/// An exhausting, injective counter with exponential search properties
|
||||
///
|
||||
/// The type implements an [`Iterator`] that yields numbers from the range
|
||||
/// from 0 to a pre-defined maximum.
|
||||
///
|
||||
/// * It is *exhausting* in that it visits all numbers inside `0..max`.
|
||||
/// * It is *injective* in that all numbers are visited only once.
|
||||
/// * It has *exponential search properties* in that it iterates over the
|
||||
/// number range in a fractal pattern.
|
||||
///
|
||||
/// This iterator is well suited for finding a pivot in algorithms that
|
||||
/// want centered pivots: its output is heavily biased towards starting
|
||||
/// with small numbers.
|
||||
///
|
||||
/// If the specified maximum is close to an exponent
|
||||
/// of two we will quickly reach that exponent and therefore we will
|
||||
/// quickly close in on the maximum, contrary to the claim of the counter
|
||||
/// providing centered pivots. For most algorithms, this is not a problem
|
||||
/// though as long as this is only a limited number of times, which it does,
|
||||
/// as the next maximum will be an exponent of two, for which the next
|
||||
/// smaller one is precisely its half of. So we will be able to perform a
|
||||
/// binary search in all instances.
|
||||
pub struct ExpCounter {
|
||||
/// The (exclusive) upper limit of our search
|
||||
max: u64,
|
||||
/// The base that increases after each round trip
|
||||
base: u64,
|
||||
/// An increasing offset, always a power of two
|
||||
offs: u64,
|
||||
/// Iteration counter
|
||||
i: u64,
|
||||
}
|
||||
|
||||
impl ExpCounter {
|
||||
/// Creates an iterator that uses `ExpCounter` for the first half of the
|
||||
/// range and a linear range counter for the second half.
|
||||
pub fn with_max_and_linear_search(max: u64) -> impl Iterator<Item = u64> {
|
||||
let linear_start = max / 2;
|
||||
ExpCounter::with_max(linear_start).chain(linear_start..max)
|
||||
}
|
||||
/// Creates a new `ExpCounter` instance that counts to the (exclusive) maximum
|
||||
pub fn with_max(max: u64) -> Self {
|
||||
Self {
|
||||
max,
|
||||
base: 0,
|
||||
offs: 1,
|
||||
i: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for ExpCounter {
|
||||
type Item = u64;
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.i >= self.max {
|
||||
return None;
|
||||
}
|
||||
if self.i == 0 {
|
||||
// Special casing this is easier than adding 0 as the first in some other fashion
|
||||
self.i += 1;
|
||||
return Some(0);
|
||||
}
|
||||
let to_yield = self.base + self.offs;
|
||||
self.offs *= 2;
|
||||
|
||||
if self.base + self.offs >= self.max {
|
||||
self.base += 1;
|
||||
self.offs = 1;
|
||||
while self.base > self.offs {
|
||||
self.offs *= 2;
|
||||
}
|
||||
if (self.base + self.offs).count_ones() == 1 {
|
||||
self.offs *= 2;
|
||||
}
|
||||
}
|
||||
self.i += 1;
|
||||
|
||||
Some(to_yield)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::collections::HashSet;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn dupes_and_missing(list: &mut [u64], max: u64) -> (usize, usize) {
|
||||
let mut contained_in_list = HashSet::<u64>::new();
|
||||
let mut dupes = 0;
|
||||
let mut missing = 0;
|
||||
|
||||
for v in list.iter() {
|
||||
println!("{v:4 } = {v:010b}");
|
||||
}
|
||||
println!("Yielded {} items", list.len());
|
||||
println!("The duplicates:");
|
||||
list.sort();
|
||||
let mut prev = None;
|
||||
for v in list.iter() {
|
||||
contained_in_list.insert(*v);
|
||||
if Some(*v) == prev {
|
||||
dupes += 1;
|
||||
println!("{v:3 } = {v:08b}");
|
||||
}
|
||||
prev = Some(*v);
|
||||
}
|
||||
println!("The missing numbers:");
|
||||
for v in 0..max {
|
||||
if !contained_in_list.contains(&v) {
|
||||
println!("{v:3 } = {v:010b}");
|
||||
missing += 1;
|
||||
}
|
||||
}
|
||||
(dupes, missing)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn to_40() {
|
||||
let max = 40;
|
||||
let list = ExpCounter::with_max(max).collect::<Vec<_>>();
|
||||
let expected = [
|
||||
0, 1, 2, 4, 8, 16, 32, 3, 5, 9, 17, 33, 6, 10, 18, 34, 7, 11, 19, 35, 12, 20, 36, 13,
|
||||
21, 37, 14, 22, 38, 15, 23, 39, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
];
|
||||
assert_eq!(list, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn to_64() {
|
||||
let max = 64;
|
||||
let mut list = ExpCounter::with_max(max).collect::<Vec<_>>();
|
||||
assert_eq!(dupes_and_missing(&mut list, max), (0, 0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn to_100() {
|
||||
let max = 100;
|
||||
let mut list = ExpCounter::with_max(max).collect::<Vec<_>>();
|
||||
assert_eq!(dupes_and_missing(&mut list, max), (0, 0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn to_127() {
|
||||
let max = 127;
|
||||
let mut list = ExpCounter::with_max(max).collect::<Vec<_>>();
|
||||
assert_eq!(dupes_and_missing(&mut list, max), (0, 0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn to_12345() {
|
||||
let max = 12345;
|
||||
let mut list = ExpCounter::with_max(max).collect::<Vec<_>>();
|
||||
assert_eq!(dupes_and_missing(&mut list, max), (0, 0));
|
||||
}
|
||||
}
|
||||
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
|
||||
///
|
||||
/// See docs/rfcs/025-generation-numbers.md for detail on how generation
|
||||
/// numbers are used.
|
||||
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
|
||||
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
pub enum Generation {
|
||||
// Generations with this magic value will not add a suffix to S3 keys, and will not
|
||||
// be included in persisted index_part.json. This value is only to be used
|
||||
|
||||
41
libs/utils/src/hex.rs
Normal file
41
libs/utils/src/hex.rs
Normal file
@@ -0,0 +1,41 @@
|
||||
/// Useful type for asserting that expected bytes match reporting the bytes more readable
|
||||
/// array-syntax compatible hex bytes.
|
||||
///
|
||||
/// # Usage
|
||||
///
|
||||
/// ```
|
||||
/// use utils::Hex;
|
||||
///
|
||||
/// let actual = serialize_something();
|
||||
/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
|
||||
///
|
||||
/// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline
|
||||
/// // output suffixed with an array style length for easier comparisons.
|
||||
/// assert_eq!(Hex(&actual), Hex(&expected));
|
||||
///
|
||||
/// // with `let expected = [0x68];` the error would had been:
|
||||
/// // assertion `left == right` failed
|
||||
/// // left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
|
||||
/// // right: [0x68; 1]
|
||||
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
|
||||
/// ```
|
||||
#[derive(PartialEq)]
|
||||
pub struct Hex<'a>(pub &'a [u8]);
|
||||
|
||||
impl std::fmt::Debug for Hex<'_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[")?;
|
||||
for (i, c) in self.0.chunks(16).enumerate() {
|
||||
if i > 0 && !c.is_empty() {
|
||||
writeln!(f, ", ")?;
|
||||
}
|
||||
for (j, b) in c.iter().enumerate() {
|
||||
if j > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
write!(f, "0x{b:02x}")?;
|
||||
}
|
||||
}
|
||||
write!(f, "; {}]", self.0.len())
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::auth::{Claims, JwtAuth};
|
||||
use crate::auth::{AuthError, Claims, SwappableJwtAuth};
|
||||
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
|
||||
use anyhow::Context;
|
||||
use hyper::header::{HeaderName, AUTHORIZATION};
|
||||
@@ -14,6 +14,11 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
use std::future::Future;
|
||||
use std::str::FromStr;
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use std::io::Write as _;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"libmetrics_metric_handler_requests_total",
|
||||
@@ -146,94 +151,89 @@ impl Drop for RequestCancelled {
|
||||
}
|
||||
}
|
||||
|
||||
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
|
||||
pub struct ChannelWriter {
|
||||
buffer: BytesMut,
|
||||
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
impl ChannelWriter {
|
||||
pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
|
||||
assert_ne!(buf_len, 0);
|
||||
ChannelWriter {
|
||||
// split about half off the buffer from the start, because we flush depending on
|
||||
// capacity. first flush will come sooner than without this, but now resizes will
|
||||
// have better chance of picking up the "other" half. not guaranteed of course.
|
||||
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
|
||||
tx,
|
||||
written: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn flush0(&mut self) -> std::io::Result<usize> {
|
||||
let n = self.buffer.len();
|
||||
if n == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
tracing::trace!(n, "flushing");
|
||||
let ready = self.buffer.split().freeze();
|
||||
|
||||
// not ideal to call from blocking code to block_on, but we are sure that this
|
||||
// operation does not spawn_blocking other tasks
|
||||
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
|
||||
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
|
||||
|
||||
// throttle sending to allow reuse of our buffer in `write`.
|
||||
self.tx.reserve().await.map_err(|_| ())?;
|
||||
|
||||
// now the response task has picked up the buffer and hopefully started
|
||||
// sending it to the client.
|
||||
Ok(())
|
||||
});
|
||||
if res.is_err() {
|
||||
return Err(std::io::ErrorKind::BrokenPipe.into());
|
||||
}
|
||||
self.written += n;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
pub fn flushed_bytes(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
}
|
||||
|
||||
impl std::io::Write for ChannelWriter {
|
||||
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
|
||||
let remaining = self.buffer.capacity() - self.buffer.len();
|
||||
|
||||
let out_of_space = remaining < buf.len();
|
||||
|
||||
let original_len = buf.len();
|
||||
|
||||
if out_of_space {
|
||||
let can_still_fit = buf.len() - remaining;
|
||||
self.buffer.extend_from_slice(&buf[..can_still_fit]);
|
||||
buf = &buf[can_still_fit..];
|
||||
self.flush0()?;
|
||||
}
|
||||
|
||||
// assume that this will often under normal operation just move the pointer back to the
|
||||
// beginning of allocation, because previous split off parts are already sent and
|
||||
// dropped.
|
||||
self.buffer.extend_from_slice(buf);
|
||||
Ok(original_len)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
self.flush0().map(|_| ())
|
||||
}
|
||||
}
|
||||
|
||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use std::io::Write as _;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
|
||||
struct ChannelWriter {
|
||||
buffer: BytesMut,
|
||||
tx: mpsc::Sender<std::io::Result<Bytes>>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
impl ChannelWriter {
|
||||
fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
|
||||
assert_ne!(buf_len, 0);
|
||||
ChannelWriter {
|
||||
// split about half off the buffer from the start, because we flush depending on
|
||||
// capacity. first flush will come sooner than without this, but now resizes will
|
||||
// have better chance of picking up the "other" half. not guaranteed of course.
|
||||
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
|
||||
tx,
|
||||
written: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn flush0(&mut self) -> std::io::Result<usize> {
|
||||
let n = self.buffer.len();
|
||||
if n == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
tracing::trace!(n, "flushing");
|
||||
let ready = self.buffer.split().freeze();
|
||||
|
||||
// not ideal to call from blocking code to block_on, but we are sure that this
|
||||
// operation does not spawn_blocking other tasks
|
||||
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
|
||||
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
|
||||
|
||||
// throttle sending to allow reuse of our buffer in `write`.
|
||||
self.tx.reserve().await.map_err(|_| ())?;
|
||||
|
||||
// now the response task has picked up the buffer and hopefully started
|
||||
// sending it to the client.
|
||||
Ok(())
|
||||
});
|
||||
if res.is_err() {
|
||||
return Err(std::io::ErrorKind::BrokenPipe.into());
|
||||
}
|
||||
self.written += n;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flushed_bytes(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
}
|
||||
|
||||
impl std::io::Write for ChannelWriter {
|
||||
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
|
||||
let remaining = self.buffer.capacity() - self.buffer.len();
|
||||
|
||||
let out_of_space = remaining < buf.len();
|
||||
|
||||
let original_len = buf.len();
|
||||
|
||||
if out_of_space {
|
||||
let can_still_fit = buf.len() - remaining;
|
||||
self.buffer.extend_from_slice(&buf[..can_still_fit]);
|
||||
buf = &buf[can_still_fit..];
|
||||
self.flush0()?;
|
||||
}
|
||||
|
||||
// assume that this will often under normal operation just move the pointer back to the
|
||||
// beginning of allocation, because previous split off parts are already sent and
|
||||
// dropped.
|
||||
self.buffer.extend_from_slice(buf);
|
||||
Ok(original_len)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
self.flush0().map(|_| ())
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
|
||||
}
|
||||
|
||||
pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
|
||||
provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::pre(move |req| async move {
|
||||
if let Some(auth) = provide_auth(&req) {
|
||||
@@ -400,9 +400,11 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
})?;
|
||||
let token = parse_token(header_value)?;
|
||||
|
||||
let data = auth
|
||||
.decode(token)
|
||||
.map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
|
||||
let data = auth.decode(token).map_err(|err| {
|
||||
warn!("Authentication error: {err}");
|
||||
// Rely on From<AuthError> for ApiError impl
|
||||
err
|
||||
})?;
|
||||
req.set_context(data.claims);
|
||||
}
|
||||
None => {
|
||||
@@ -450,12 +452,11 @@ where
|
||||
|
||||
pub fn check_permission_with(
|
||||
req: &Request<Body>,
|
||||
check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
|
||||
check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
|
||||
) -> Result<(), ApiError> {
|
||||
match req.context::<Claims>() {
|
||||
Some(claims) => {
|
||||
Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
|
||||
}
|
||||
Some(claims) => Ok(check_permission(&claims)
|
||||
.map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
|
||||
None => Ok(()), // claims is None because auth is disabled
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
|
||||
use std::borrow::Cow;
|
||||
use std::error::Error as StdError;
|
||||
use thiserror::Error;
|
||||
use tracing::{error, info};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ApiError {
|
||||
@@ -118,6 +118,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
|
||||
// Print a stack trace for Internal Server errors
|
||||
|
||||
match api_error {
|
||||
ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
|
||||
warn!("Error processing HTTP request: {api_error:#}")
|
||||
}
|
||||
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::{fmt, str::FromStr};
|
||||
use anyhow::Context;
|
||||
use hex::FromHex;
|
||||
use rand::Rng;
|
||||
use serde::de::Visitor;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
|
||||
@@ -17,12 +18,74 @@ pub enum IdError {
|
||||
///
|
||||
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
|
||||
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
|
||||
///
|
||||
/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
|
||||
/// Check the `serde_with::serde_as` documentation for options for more complex types.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||
struct Id([u8; 16]);
|
||||
|
||||
impl Serialize for Id {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
self.0.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Id {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct IdVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> Visitor<'de> for IdVisitor {
|
||||
type Value = Id;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str("value in form of hex string")
|
||||
} else {
|
||||
formatter.write_str("value in form of integer array([u8; 16])")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'de>,
|
||||
{
|
||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
||||
let id: [u8; 16] = Deserialize::deserialize(s)?;
|
||||
Ok(Id::from(id))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
Id::from_str(v).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(IdVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_tuple(
|
||||
16,
|
||||
IdVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Id {
|
||||
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
|
||||
let mut arr = [0u8; 16];
|
||||
@@ -57,6 +120,8 @@ impl Id {
|
||||
chunk[0] = HEX[((b >> 4) & 0xf) as usize];
|
||||
chunk[1] = HEX[(b & 0xf) as usize];
|
||||
}
|
||||
|
||||
// SAFETY: vec constructed out of `HEX`, it can only be ascii
|
||||
unsafe { String::from_utf8_unchecked(buf) }
|
||||
}
|
||||
}
|
||||
@@ -308,3 +373,112 @@ impl fmt::Display for NodeId {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use serde_assert::{Deserializer, Serializer, Token, Tokens};
|
||||
|
||||
use crate::bin_ser::BeSer;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_id_serde_non_human_readable() {
|
||||
let original_id = Id([
|
||||
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
|
||||
]);
|
||||
let expected_tokens = Tokens(vec![
|
||||
Token::Tuple { len: 16 },
|
||||
Token::U8(173),
|
||||
Token::U8(80),
|
||||
Token::U8(132),
|
||||
Token::U8(115),
|
||||
Token::U8(129),
|
||||
Token::U8(226),
|
||||
Token::U8(72),
|
||||
Token::U8(254),
|
||||
Token::U8(170),
|
||||
Token::U8(201),
|
||||
Token::U8(135),
|
||||
Token::U8(108),
|
||||
Token::U8(199),
|
||||
Token::U8(26),
|
||||
Token::U8(228),
|
||||
Token::U8(24),
|
||||
Token::TupleEnd,
|
||||
]);
|
||||
|
||||
let serializer = Serializer::builder().is_human_readable(false).build();
|
||||
let serialized_tokens = original_id.serialize(&serializer).unwrap();
|
||||
assert_eq!(serialized_tokens, expected_tokens);
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(false)
|
||||
.tokens(serialized_tokens)
|
||||
.build();
|
||||
let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
|
||||
assert_eq!(deserialized_id, original_id);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_id_serde_human_readable() {
|
||||
let original_id = Id([
|
||||
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
|
||||
]);
|
||||
let expected_tokens = Tokens(vec![Token::Str(String::from(
|
||||
"ad50847381e248feaac9876cc71ae418",
|
||||
))]);
|
||||
|
||||
let serializer = Serializer::builder().is_human_readable(true).build();
|
||||
let serialized_tokens = original_id.serialize(&serializer).unwrap();
|
||||
assert_eq!(serialized_tokens, expected_tokens);
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(true)
|
||||
.tokens(Tokens(vec![Token::Str(String::from(
|
||||
"ad50847381e248feaac9876cc71ae418",
|
||||
))]))
|
||||
.build();
|
||||
assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
|
||||
}
|
||||
|
||||
macro_rules! roundtrip_type {
|
||||
($type:ty, $expected_bytes:expr) => {{
|
||||
let expected_bytes: [u8; 16] = $expected_bytes;
|
||||
let original_id = <$type>::from(expected_bytes);
|
||||
|
||||
let ser_bytes = original_id.ser().unwrap();
|
||||
assert_eq!(ser_bytes, expected_bytes);
|
||||
|
||||
let des_id = <$type>::des(&ser_bytes).unwrap();
|
||||
assert_eq!(des_id, original_id);
|
||||
}};
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_id_bincode_serde() {
|
||||
let expected_bytes = [
|
||||
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
|
||||
];
|
||||
|
||||
roundtrip_type!(Id, expected_bytes);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tenant_id_bincode_serde() {
|
||||
let expected_bytes = [
|
||||
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
|
||||
];
|
||||
|
||||
roundtrip_type!(TenantId, expected_bytes);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_timeline_id_bincode_serde() {
|
||||
let expected_bytes = [
|
||||
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
|
||||
];
|
||||
|
||||
roundtrip_type!(TimelineId, expected_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
//! `utils` is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
pub mod backoff;
|
||||
|
||||
@@ -24,6 +25,10 @@ pub mod auth;
|
||||
|
||||
// utility functions and helper traits for unified unique id generation/serialization etc.
|
||||
pub mod id;
|
||||
|
||||
mod hex;
|
||||
pub use hex::Hex;
|
||||
|
||||
// http endpoint utils
|
||||
pub mod http;
|
||||
|
||||
@@ -73,6 +78,10 @@ pub mod completion;
|
||||
/// Reporting utilities
|
||||
pub mod error;
|
||||
|
||||
pub mod exp_counter;
|
||||
/// async timeout helper
|
||||
pub mod timeout;
|
||||
|
||||
pub mod sync;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
@@ -130,6 +139,21 @@ macro_rules! project_git_version {
|
||||
};
|
||||
}
|
||||
|
||||
/// This is a shortcut to embed build tag into binaries and avoid copying the same build script to all packages
|
||||
#[macro_export]
|
||||
macro_rules! project_build_tag {
|
||||
($const_identifier:ident) => {
|
||||
const $const_identifier: &::core::primitive::str = {
|
||||
const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("BUILD_TAG") {
|
||||
::core::option::Option::Some(x) => ["build_tag-env:", x],
|
||||
::core::option::Option::None => ["build_tag:", ""],
|
||||
};
|
||||
|
||||
$crate::__const_format::concatcp!(__ARG[0], __ARG[1])
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
/// Re-export for `project_git_version` macro
|
||||
#[doc(hidden)]
|
||||
pub use const_format as __const_format;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#![warn(missing_docs)]
|
||||
|
||||
use camino::Utf8Path;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::{de::Visitor, Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::ops::{Add, AddAssign};
|
||||
use std::str::FromStr;
|
||||
@@ -13,10 +13,114 @@ use crate::seqwait::MonotonicCounter;
|
||||
pub const XLOG_BLCKSZ: u32 = 8192;
|
||||
|
||||
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
|
||||
pub struct Lsn(pub u64);
|
||||
|
||||
impl Serialize for Lsn {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
self.0.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Lsn {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct LsnVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> Visitor<'de> for LsnVisitor {
|
||||
type Value = Lsn;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str(
|
||||
"value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
|
||||
)
|
||||
} else {
|
||||
formatter.write_str("value in form of integer(u64)")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
Ok(Lsn(v))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
Lsn::from_str(v).map_err(|e| E::custom(e))
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(LsnVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_u64(LsnVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Allows (de)serialization of an `Lsn` always as `u64`.
|
||||
///
|
||||
/// ### Example
|
||||
///
|
||||
/// ```rust
|
||||
/// # use serde::{Serialize, Deserialize};
|
||||
/// use utils::lsn::Lsn;
|
||||
///
|
||||
/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
|
||||
/// struct Foo {
|
||||
/// #[serde(with = "utils::lsn::serde_as_u64")]
|
||||
/// always_u64: Lsn,
|
||||
/// }
|
||||
///
|
||||
/// let orig = Foo { always_u64: Lsn(1234) };
|
||||
///
|
||||
/// let res = serde_json::to_string(&orig).unwrap();
|
||||
/// assert_eq!(res, r#"{"always_u64":1234}"#);
|
||||
///
|
||||
/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
|
||||
/// assert_eq!(foo, orig);
|
||||
/// ```
|
||||
///
|
||||
pub mod serde_as_u64 {
|
||||
use super::Lsn;
|
||||
|
||||
/// Serializes the Lsn as u64 disregarding the human readability of the format.
|
||||
///
|
||||
/// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
|
||||
pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
use serde::Serialize;
|
||||
lsn.0.serialize(serializer)
|
||||
}
|
||||
|
||||
/// Deserializes the Lsn as u64 disregarding the human readability of the format.
|
||||
///
|
||||
/// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
|
||||
pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
|
||||
use serde::Deserialize;
|
||||
u64::deserialize(deserializer).map(Lsn)
|
||||
}
|
||||
}
|
||||
|
||||
/// We tried to parse an LSN from a string, but failed
|
||||
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
|
||||
#[error("LsnParseError")]
|
||||
@@ -264,8 +368,13 @@ impl MonotonicCounter<Lsn> for RecordLsn {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::bin_ser::BeSer;
|
||||
|
||||
use super::*;
|
||||
|
||||
use serde::ser::Serialize;
|
||||
use serde_assert::{Deserializer, Serializer, Token, Tokens};
|
||||
|
||||
#[test]
|
||||
fn test_lsn_strings() {
|
||||
assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
|
||||
@@ -341,4 +450,95 @@ mod tests {
|
||||
assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
|
||||
assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lsn_serde() {
|
||||
let original_lsn = Lsn(0x0123456789abcdef);
|
||||
let expected_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
|
||||
let expected_non_readable_tokens =
|
||||
Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
|
||||
|
||||
// Testing human_readable ser/de
|
||||
let serializer = Serializer::builder().is_human_readable(false).build();
|
||||
let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
|
||||
assert_eq!(readable_ser_tokens, expected_readable_tokens);
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(false)
|
||||
.tokens(readable_ser_tokens)
|
||||
.build();
|
||||
let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
|
||||
assert_eq!(des_lsn, original_lsn);
|
||||
|
||||
// Testing NON human_readable ser/de
|
||||
let serializer = Serializer::builder().is_human_readable(true).build();
|
||||
let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
|
||||
assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(true)
|
||||
.tokens(non_readable_ser_tokens)
|
||||
.build();
|
||||
let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
|
||||
assert_eq!(des_lsn, original_lsn);
|
||||
|
||||
// Testing mismatching ser/de
|
||||
let serializer = Serializer::builder().is_human_readable(false).build();
|
||||
let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(true)
|
||||
.tokens(non_readable_ser_tokens)
|
||||
.build();
|
||||
Lsn::deserialize(&mut deserializer).unwrap_err();
|
||||
|
||||
let serializer = Serializer::builder().is_human_readable(true).build();
|
||||
let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(false)
|
||||
.tokens(readable_ser_tokens)
|
||||
.build();
|
||||
Lsn::deserialize(&mut deserializer).unwrap_err();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lsn_ensure_roundtrip() {
|
||||
let original_lsn = Lsn(0xaaaabbbb);
|
||||
|
||||
let serializer = Serializer::builder().is_human_readable(false).build();
|
||||
let ser_tokens = original_lsn.serialize(&serializer).unwrap();
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(false)
|
||||
.tokens(ser_tokens)
|
||||
.build();
|
||||
|
||||
let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
|
||||
assert_eq!(des_lsn, original_lsn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lsn_bincode_serde() {
|
||||
let lsn = Lsn(0x0123456789abcdef);
|
||||
let expected_bytes = [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef];
|
||||
|
||||
let ser_bytes = lsn.ser().unwrap();
|
||||
assert_eq!(ser_bytes, expected_bytes);
|
||||
|
||||
let des_lsn = Lsn::des(&ser_bytes).unwrap();
|
||||
assert_eq!(des_lsn, lsn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lsn_bincode_ensure_roundtrip() {
|
||||
let original_lsn = Lsn(0x01_02_03_04_05_06_07_08);
|
||||
let expected_bytes = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
|
||||
|
||||
let ser_bytes = original_lsn.ser().unwrap();
|
||||
assert_eq!(ser_bytes, expected_bytes);
|
||||
|
||||
let des_lsn = Lsn::des(&ser_bytes).unwrap();
|
||||
assert_eq!(des_lsn, original_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::time::{Duration, SystemTime};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use pq_proto::{read_cstr, PG_EPOCH};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tracing::{trace, warn};
|
||||
|
||||
use crate::lsn::Lsn;
|
||||
@@ -15,21 +14,17 @@ use crate::lsn::Lsn;
|
||||
///
|
||||
/// serde Serialize is used only for human readable dump to json (e.g. in
|
||||
/// safekeepers debug_dump).
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct PageserverFeedback {
|
||||
/// Last known size of the timeline. Used to enforce timeline size limit.
|
||||
pub current_timeline_size: u64,
|
||||
/// LSN last received and ingested by the pageserver. Controls backpressure.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub last_received_lsn: Lsn,
|
||||
/// LSN up to which data is persisted by the pageserver to its local disc.
|
||||
/// Controls backpressure.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
/// LSN up to which data is persisted by the pageserver on s3; safekeepers
|
||||
/// consider WAL before it can be removed.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
// Serialize with RFC3339 format.
|
||||
#[serde(with = "serde_systemtime")]
|
||||
|
||||
@@ -125,6 +125,9 @@ where
|
||||
// Wake everyone with an error.
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
|
||||
// Block any future waiters from starting
|
||||
internal.shutdown = true;
|
||||
|
||||
// This will steal the entire waiters map.
|
||||
// When we drop it all waiters will be woken.
|
||||
mem::take(&mut internal.waiters)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
/// Immediately terminate the calling process without calling
|
||||
/// atexit callbacks, C runtime destructors etc. We mainly use
|
||||
/// this to protect coverage data from concurrent writes.
|
||||
pub fn exit_now(code: u8) {
|
||||
pub fn exit_now(code: u8) -> ! {
|
||||
// SAFETY: exiting is safe, the ffi is not safe
|
||||
unsafe { nix::libc::_exit(code as _) };
|
||||
}
|
||||
|
||||
@@ -1 +1,3 @@
|
||||
pub mod heavier_once_cell;
|
||||
|
||||
pub mod gate;
|
||||
|
||||
158
libs/utils/src/sync/gate.rs
Normal file
158
libs/utils/src/sync/gate.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
|
||||
///
|
||||
/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
|
||||
/// the resource calls `close()` when they want to ensure that all holders of guards
|
||||
/// have released them, and that no future guards will be issued.
|
||||
pub struct Gate {
|
||||
/// Each caller of enter() takes one unit from the semaphore. In close(), we
|
||||
/// take all the units to ensure all GateGuards are destroyed.
|
||||
sem: Arc<tokio::sync::Semaphore>,
|
||||
|
||||
/// For observability only: a name that will be used to log warnings if a particular
|
||||
/// gate is holding up shutdown
|
||||
name: String,
|
||||
}
|
||||
|
||||
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
|
||||
/// not complete.
|
||||
#[derive(Debug)]
|
||||
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
|
||||
|
||||
/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
|
||||
async fn warn_if_stuck<Fut: std::future::Future>(
|
||||
fut: Fut,
|
||||
name: &str,
|
||||
warn_period: std::time::Duration,
|
||||
) -> <Fut as std::future::Future>::Output {
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
loop {
|
||||
match tokio::time::timeout(warn_period, &mut fut).await {
|
||||
Ok(ret) => return ret,
|
||||
Err(_) => {
|
||||
tracing::warn!(
|
||||
gate = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"still waiting, taking longer than expected..."
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum GateError {
|
||||
GateClosed,
|
||||
}
|
||||
|
||||
impl Gate {
|
||||
const MAX_UNITS: u32 = u32::MAX;
|
||||
|
||||
pub fn new(name: String) -> Self {
|
||||
Self {
|
||||
sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
|
||||
name,
|
||||
}
|
||||
}
|
||||
|
||||
/// Acquire a guard that will prevent close() calls from completing. If close()
|
||||
/// was already called, this will return an error which should be interpreted
|
||||
/// as "shutting down".
|
||||
///
|
||||
/// This function would typically be used from e.g. request handlers. While holding
|
||||
/// the guard returned from this function, it is important to respect a CancellationToken
|
||||
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
|
||||
/// also contain a CancellationToken.
|
||||
pub fn enter(&self) -> Result<GateGuard, GateError> {
|
||||
self.sem
|
||||
.clone()
|
||||
.try_acquire_owned()
|
||||
.map(GateGuard)
|
||||
.map_err(|_| GateError::GateClosed)
|
||||
}
|
||||
|
||||
/// Types with a shutdown() method and a gate should call this method at the
|
||||
/// end of shutdown, to ensure that all GateGuard holders are done.
|
||||
///
|
||||
/// This will wait for all guards to be destroyed. For this to complete promptly, it is
|
||||
/// important that the holders of such guards are respecting a CancellationToken which has
|
||||
/// been cancelled before entering this function.
|
||||
pub async fn close(&self) {
|
||||
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
|
||||
}
|
||||
|
||||
/// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This
|
||||
/// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
|
||||
/// the CancellationToken on such types is analogous to "Did shutdown start?"
|
||||
pub fn close_complete(&self) -> bool {
|
||||
self.sem.is_closed()
|
||||
}
|
||||
|
||||
async fn do_close(&self) {
|
||||
tracing::debug!(gate = self.name, "Closing Gate...");
|
||||
match self.sem.acquire_many(Self::MAX_UNITS).await {
|
||||
Ok(_units) => {
|
||||
// While holding all units, close the semaphore. All subsequent calls to enter() will fail.
|
||||
self.sem.close();
|
||||
}
|
||||
Err(_) => {
|
||||
// Semaphore closed: we are the only function that can do this, so it indicates a double-call.
|
||||
// This is legal. Timeline::shutdown for example is not protected from being called more than
|
||||
// once.
|
||||
tracing::debug!(gate = self.name, "Double close")
|
||||
}
|
||||
}
|
||||
tracing::debug!(gate = self.name, "Closed Gate.")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::FutureExt;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_idle_gate() {
|
||||
// Having taken no gates, we should not be blocked in close
|
||||
let gate = Gate::new("test".to_string());
|
||||
gate.close().await;
|
||||
|
||||
// If a guard is dropped before entering, close should not be blocked
|
||||
let gate = Gate::new("test".to_string());
|
||||
let guard = gate.enter().unwrap();
|
||||
drop(guard);
|
||||
gate.close().await;
|
||||
|
||||
// Entering a closed guard fails
|
||||
gate.enter().expect_err("enter should fail after close");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_busy_gate() {
|
||||
let gate = Gate::new("test".to_string());
|
||||
|
||||
let guard = gate.enter().unwrap();
|
||||
|
||||
let mut close_fut = std::pin::pin!(gate.close());
|
||||
|
||||
// Close should be blocked
|
||||
assert!(close_fut.as_mut().now_or_never().is_none());
|
||||
|
||||
// Attempting to enter() should fail, even though close isn't done yet.
|
||||
gate.enter()
|
||||
.expect_err("enter should fail after entering close");
|
||||
|
||||
drop(guard);
|
||||
|
||||
// Guard is gone, close should finish
|
||||
assert!(close_fut.as_mut().now_or_never().is_some());
|
||||
|
||||
// Attempting to enter() is still forbidden
|
||||
gate.enter().expect_err("enter should fail finishing close");
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,7 @@
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc, Mutex, MutexGuard,
|
||||
};
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
|
||||
@@ -10,6 +13,7 @@ use tokio::sync::Semaphore;
|
||||
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
|
||||
pub struct OnceCell<T> {
|
||||
inner: Mutex<Inner<T>>,
|
||||
initializers: AtomicUsize,
|
||||
}
|
||||
|
||||
impl<T> Default for OnceCell<T> {
|
||||
@@ -17,6 +21,7 @@ impl<T> Default for OnceCell<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
inner: Default::default(),
|
||||
initializers: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -49,6 +54,7 @@ impl<T> OnceCell<T> {
|
||||
init_semaphore: Arc::new(sem),
|
||||
value: Some(value),
|
||||
}),
|
||||
initializers: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,8 +66,8 @@ impl<T> OnceCell<T> {
|
||||
/// Initialization is panic-safe and cancellation-safe.
|
||||
pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
|
||||
where
|
||||
F: FnOnce() -> Fut,
|
||||
Fut: std::future::Future<Output = Result<T, E>>,
|
||||
F: FnOnce(InitPermit) -> Fut,
|
||||
Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
|
||||
{
|
||||
let sem = {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
@@ -71,29 +77,61 @@ impl<T> OnceCell<T> {
|
||||
guard.init_semaphore.clone()
|
||||
};
|
||||
|
||||
let permit = sem.acquire_owned().await;
|
||||
if permit.is_err() {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
} else {
|
||||
// now we try
|
||||
let value = factory().await?;
|
||||
let permit = {
|
||||
// increment the count for the duration of queued
|
||||
let _guard = CountWaitingInitializers::start(self);
|
||||
sem.acquire_owned().await
|
||||
};
|
||||
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
assert!(
|
||||
guard.value.is_none(),
|
||||
"we won permit, must not be initialized"
|
||||
);
|
||||
guard.value = Some(value);
|
||||
guard.init_semaphore.close();
|
||||
Ok(Guard(guard))
|
||||
match permit {
|
||||
Ok(permit) => {
|
||||
let permit = InitPermit(permit);
|
||||
let (value, _permit) = factory(permit).await?;
|
||||
|
||||
let guard = self.inner.lock().unwrap();
|
||||
|
||||
Ok(Self::set0(value, guard))
|
||||
}
|
||||
Err(_closed) => {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||
/// to complete initializing the inner value.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If the inner has already been initialized.
|
||||
pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
|
||||
// cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
|
||||
// give more permits right now.
|
||||
if guard.init_semaphore.try_acquire().is_ok() {
|
||||
drop(guard);
|
||||
panic!("permit is of wrong origin");
|
||||
}
|
||||
|
||||
Self::set0(value, guard)
|
||||
}
|
||||
|
||||
fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
|
||||
if guard.value.is_some() {
|
||||
drop(guard);
|
||||
unreachable!("we won permit, must not be initialized");
|
||||
}
|
||||
guard.value = Some(value);
|
||||
guard.init_semaphore.close();
|
||||
Guard(guard)
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, if any.
|
||||
pub fn get(&self) -> Option<Guard<'_, T>> {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
@@ -103,6 +141,28 @@ impl<T> OnceCell<T> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
|
||||
pub fn initializer_count(&self) -> usize {
|
||||
self.initializers.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
|
||||
/// initializing task for example at the end of initialization.
|
||||
struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
|
||||
|
||||
impl<'a, T> CountWaitingInitializers<'a, T> {
|
||||
fn start(target: &'a OnceCell<T>) -> Self {
|
||||
target.initializers.fetch_add(1, Ordering::Relaxed);
|
||||
CountWaitingInitializers(target)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
|
||||
fn drop(&mut self) {
|
||||
self.0.initializers.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
|
||||
@@ -135,7 +195,7 @@ impl<'a, T> Guard<'a, T> {
|
||||
///
|
||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||
pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
|
||||
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
||||
let mut swapped = Inner::default();
|
||||
let permit = swapped
|
||||
.init_semaphore
|
||||
@@ -145,11 +205,14 @@ impl<'a, T> Guard<'a, T> {
|
||||
std::mem::swap(&mut *self.0, &mut swapped);
|
||||
swapped
|
||||
.value
|
||||
.map(|v| (v, permit))
|
||||
.map(|v| (v, InitPermit(permit)))
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
/// Type held by OnceCell (de)initializing task.
|
||||
pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -185,11 +248,11 @@ mod tests {
|
||||
barrier.wait().await;
|
||||
let won = {
|
||||
let g = cell
|
||||
.get_or_init(|| {
|
||||
.get_or_init(|permit| {
|
||||
counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
|
||||
async {
|
||||
counters.future_polled.fetch_add(1, Ordering::Relaxed);
|
||||
Ok::<_, Infallible>(i)
|
||||
Ok::<_, Infallible>((i, permit))
|
||||
}
|
||||
})
|
||||
.await
|
||||
@@ -243,7 +306,7 @@ mod tests {
|
||||
deinitialization_started.wait().await;
|
||||
|
||||
let started_at = tokio::time::Instant::now();
|
||||
cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
|
||||
cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -258,18 +321,32 @@ mod tests {
|
||||
assert_eq!(*cell.get().unwrap(), reinit);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reinit_with_deinit_permit() {
|
||||
let cell = Arc::new(OnceCell::new(42));
|
||||
|
||||
let (mol, permit) = cell.get().unwrap().take_and_deinit();
|
||||
cell.set(5, permit);
|
||||
assert_eq!(*cell.get().unwrap(), 5);
|
||||
|
||||
let (five, permit) = cell.get().unwrap().take_and_deinit();
|
||||
assert_eq!(5, five);
|
||||
cell.set(mol, permit);
|
||||
assert_eq!(*cell.get().unwrap(), 42);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn initialization_attemptable_until_ok() {
|
||||
let cell = OnceCell::default();
|
||||
|
||||
for _ in 0..10 {
|
||||
cell.get_or_init(|| async { Err("whatever error") })
|
||||
cell.get_or_init(|_permit| async { Err("whatever error") })
|
||||
.await
|
||||
.unwrap_err();
|
||||
}
|
||||
|
||||
let g = cell
|
||||
.get_or_init(|| async { Ok::<_, Infallible>("finally success") })
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(*g, "finally success");
|
||||
@@ -281,11 +358,11 @@ mod tests {
|
||||
|
||||
let barrier = tokio::sync::Barrier::new(2);
|
||||
|
||||
let initializer = cell.get_or_init(|| async {
|
||||
let initializer = cell.get_or_init(|permit| async {
|
||||
barrier.wait().await;
|
||||
futures::future::pending::<()>().await;
|
||||
|
||||
Ok::<_, Infallible>("never reached")
|
||||
Ok::<_, Infallible>(("never reached", permit))
|
||||
});
|
||||
|
||||
tokio::select! {
|
||||
@@ -298,7 +375,7 @@ mod tests {
|
||||
assert!(cell.get().is_none());
|
||||
|
||||
let g = cell
|
||||
.get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(*g, "now initialized");
|
||||
|
||||
37
libs/utils/src/timeout.rs
Normal file
37
libs/utils/src/timeout.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
pub enum TimeoutCancellableError {
|
||||
Timeout,
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
/// Wrap [`tokio::time::timeout`] with a CancellationToken.
|
||||
///
|
||||
/// This wrapper is appropriate for any long running operation in a task
|
||||
/// that ought to respect a CancellationToken (which means most tasks).
|
||||
///
|
||||
/// The only time you should use a bare tokio::timeout is when the future `F`
|
||||
/// itself respects a CancellationToken: otherwise, always use this wrapper
|
||||
/// with your CancellationToken to ensure that your task does not hold up
|
||||
/// graceful shutdown.
|
||||
pub async fn timeout_cancellable<F>(
|
||||
duration: Duration,
|
||||
cancel: &CancellationToken,
|
||||
future: F,
|
||||
) -> Result<F::Output, TimeoutCancellableError>
|
||||
where
|
||||
F: std::future::Future,
|
||||
{
|
||||
tokio::select!(
|
||||
r = tokio::time::timeout(duration, future) => {
|
||||
r.map_err(|_| TimeoutCancellableError::Timeout)
|
||||
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
Err(TimeoutCancellableError::Cancelled)
|
||||
|
||||
}
|
||||
)
|
||||
}
|
||||
@@ -19,13 +19,12 @@ inotify.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
sysinfo.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
cgroups-rs = "0.3.3"
|
||||
|
||||
@@ -21,11 +21,6 @@ pub struct FileCacheState {
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FileCacheConfig {
|
||||
/// Whether the file cache is *actually* stored in memory (e.g. by writing to
|
||||
/// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
|
||||
/// memory available for the cgroup.
|
||||
pub(crate) in_memory: bool,
|
||||
|
||||
/// The size of the file cache, in terms of the size of the resource it consumes
|
||||
/// (currently: only memory)
|
||||
///
|
||||
@@ -59,22 +54,9 @@ pub struct FileCacheConfig {
|
||||
spread_factor: f64,
|
||||
}
|
||||
|
||||
impl FileCacheConfig {
|
||||
pub fn default_in_memory() -> Self {
|
||||
impl Default for FileCacheConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
in_memory: true,
|
||||
// 75 %
|
||||
resource_multiplier: 0.75,
|
||||
// 640 MiB; (512 + 128)
|
||||
min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
|
||||
// ensure any increase in file cache size is split 90-10 with 10% to other memory
|
||||
spread_factor: 0.1,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn default_on_disk() -> Self {
|
||||
Self {
|
||||
in_memory: false,
|
||||
resource_multiplier: 0.75,
|
||||
// 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
|
||||
// memory, the kernel will just evict from its page cache, rather than e.g. killing
|
||||
@@ -83,7 +65,9 @@ impl FileCacheConfig {
|
||||
spread_factor: 0.1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FileCacheConfig {
|
||||
/// Make sure fields of the config are consistent.
|
||||
pub fn validate(&self) -> anyhow::Result<()> {
|
||||
// Single field validity
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
#![cfg(target_os = "linux")]
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -39,16 +41,6 @@ pub struct Args {
|
||||
#[arg(short, long)]
|
||||
pub pgconnstr: Option<String>,
|
||||
|
||||
/// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
|
||||
/// kernel's page cache), and therefore should not count against available memory.
|
||||
//
|
||||
// NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
|
||||
// than a roundabout way, via whether it's on disk), but in order to be backwards compatible
|
||||
// during the switch away from an in-memory file cache, we had to default to the previous
|
||||
// behavior.
|
||||
#[arg(long)]
|
||||
pub file_cache_on_disk: bool,
|
||||
|
||||
/// The address we should listen on for connection requests. For the
|
||||
/// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
|
||||
#[arg(short, long)]
|
||||
|
||||
@@ -156,10 +156,7 @@ impl Runner {
|
||||
// memory limits.
|
||||
if let Some(connstr) = &args.pgconnstr {
|
||||
info!("initializing file cache");
|
||||
let config = match args.file_cache_on_disk {
|
||||
true => FileCacheConfig::default_on_disk(),
|
||||
false => FileCacheConfig::default_in_memory(),
|
||||
};
|
||||
let config = FileCacheConfig::default();
|
||||
|
||||
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
|
||||
.await
|
||||
@@ -187,10 +184,7 @@ impl Runner {
|
||||
info!("file cache size actually got set to {actual_size}")
|
||||
}
|
||||
|
||||
if args.file_cache_on_disk {
|
||||
file_cache_disk_size = actual_size;
|
||||
}
|
||||
|
||||
file_cache_disk_size = actual_size;
|
||||
state.filecache = Some(file_cache);
|
||||
}
|
||||
|
||||
@@ -239,17 +233,11 @@ impl Runner {
|
||||
|
||||
let requested_mem = target.mem;
|
||||
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
|
||||
let (expected_file_cache_size, expected_file_cache_disk_size) = self
|
||||
let expected_file_cache_size = self
|
||||
.filecache
|
||||
.as_ref()
|
||||
.map(|file_cache| {
|
||||
let size = file_cache.config.calculate_cache_size(usable_system_memory);
|
||||
match file_cache.config.in_memory {
|
||||
true => (size, 0),
|
||||
false => (size, size),
|
||||
}
|
||||
})
|
||||
.unwrap_or((0, 0));
|
||||
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
|
||||
.unwrap_or(0);
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let (last_time, last_history) = *cgroup.watcher.borrow();
|
||||
|
||||
@@ -273,7 +261,7 @@ impl Runner {
|
||||
|
||||
let new_threshold = self
|
||||
.config
|
||||
.cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
|
||||
.cgroup_threshold(usable_system_memory, expected_file_cache_size);
|
||||
|
||||
let current = last_history.avg_non_reclaimable;
|
||||
|
||||
@@ -300,13 +288,10 @@ impl Runner {
|
||||
.set_file_cache_size(expected_file_cache_size)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
if !file_cache.config.in_memory {
|
||||
file_cache_disk_size = actual_usage;
|
||||
}
|
||||
file_cache_disk_size = actual_usage;
|
||||
let message = format!(
|
||||
"set file cache size to {} MiB (in memory = {})",
|
||||
"set file cache size to {} MiB",
|
||||
bytes_to_mebibytes(actual_usage),
|
||||
file_cache.config.in_memory,
|
||||
);
|
||||
info!("downscale: {message}");
|
||||
status.push(message);
|
||||
@@ -357,9 +342,7 @@ impl Runner {
|
||||
.set_file_cache_size(expected_usage)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
if !file_cache.config.in_memory {
|
||||
file_cache_disk_size = actual_usage;
|
||||
}
|
||||
file_cache_disk_size = actual_usage;
|
||||
|
||||
if actual_usage != expected_usage {
|
||||
warn!(
|
||||
|
||||
@@ -188,6 +188,7 @@ extern "C" fn recovery_download(
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
extern "C" fn wal_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut ::std::os::raw::c_char,
|
||||
@@ -421,6 +422,7 @@ impl std::fmt::Display for Level {
|
||||
}
|
||||
|
||||
/// Take ownership of `Vec<u8>` from StringInfoData.
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
|
||||
if pg.data.is_null() {
|
||||
return None;
|
||||
|
||||
@@ -186,7 +186,7 @@ impl Wrapper {
|
||||
.unwrap()
|
||||
.into_bytes_with_nul();
|
||||
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
|
||||
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
|
||||
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;
|
||||
|
||||
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
|
||||
|
||||
|
||||
@@ -1,22 +1,21 @@
|
||||
use anyhow::{bail, Result};
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::auth::{AuthError, Claims, Scope};
|
||||
use utils::id::TenantId;
|
||||
|
||||
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
|
||||
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
|
||||
match (&claims.scope, tenant_id) {
|
||||
(Scope::Tenant, None) => {
|
||||
bail!("Attempt to access management api with tenant scope. Permission denied")
|
||||
}
|
||||
(Scope::Tenant, None) => Err(AuthError(
|
||||
"Attempt to access management api with tenant scope. Permission denied".into(),
|
||||
)),
|
||||
(Scope::Tenant, Some(tenant_id)) => {
|
||||
if claims.tenant_id.unwrap() != tenant_id {
|
||||
bail!("Tenant id mismatch. Permission denied")
|
||||
return Err(AuthError("Tenant id mismatch. Permission denied".into()));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
|
||||
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
|
||||
(Scope::SafekeeperData, _) => {
|
||||
bail!("SafekeeperData scope makes no sense for Pageserver")
|
||||
}
|
||||
(Scope::SafekeeperData, _) => Err(AuthError(
|
||||
"SafekeeperData scope makes no sense for Pageserver".into(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use pageserver::tenant::{secondary, TenantSharedResources};
|
||||
use pageserver::tenant::TenantSharedResources;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
@@ -34,11 +34,15 @@ use postgres_backend::AuthType;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::signals::ShutdownSignals;
|
||||
use utils::{
|
||||
auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
|
||||
auth::{JwtAuth, SwappableJwtAuth},
|
||||
logging, project_build_tag, project_git_version,
|
||||
sentry_init::init_sentry,
|
||||
signals::Signal,
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
@@ -258,11 +262,12 @@ fn start_pageserver(
|
||||
// A changed version string indicates changed software.
|
||||
// A changed launch timestamp indicates a pageserver restart.
|
||||
info!(
|
||||
"version: {} launch_timestamp: {}",
|
||||
"version: {} launch_timestamp: {} build_tag: {}",
|
||||
version(),
|
||||
launch_ts.to_string()
|
||||
launch_ts.to_string(),
|
||||
BUILD_TAG,
|
||||
);
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
set_launch_timestamp_metric(launch_ts);
|
||||
pageserver::preinitialize_metrics();
|
||||
|
||||
@@ -319,13 +324,12 @@ fn start_pageserver(
|
||||
let http_auth;
|
||||
let pg_auth;
|
||||
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
|
||||
// unwrap is ok because check is performed when creating config, so path is set and file exists
|
||||
// unwrap is ok because check is performed when creating config, so path is set and exists
|
||||
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
|
||||
info!(
|
||||
"Loading public key for verifying JWT tokens from {:#?}",
|
||||
key_path
|
||||
);
|
||||
let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
|
||||
info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
|
||||
|
||||
let jwt_auth = JwtAuth::from_key_path(key_path)?;
|
||||
let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));
|
||||
|
||||
http_auth = match &conf.http_auth_type {
|
||||
AuthType::Trust => None,
|
||||
@@ -524,18 +528,6 @@ fn start_pageserver(
|
||||
}
|
||||
});
|
||||
|
||||
let secondary_controller = if let Some(remote_storage) = &remote_storage {
|
||||
secondary::spawn_tasks(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
remote_storage.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
shutdown_pageserver.clone(),
|
||||
)
|
||||
} else {
|
||||
secondary::null_controller()
|
||||
};
|
||||
|
||||
// shared state between the disk-usage backed eviction background task and the http endpoint
|
||||
// that allows triggering disk-usage based eviction manually. note that the http endpoint
|
||||
// is still accessible even if background task is not configured as long as remote storage has
|
||||
@@ -547,7 +539,6 @@ fn start_pageserver(
|
||||
conf,
|
||||
remote_storage.clone(),
|
||||
disk_usage_eviction_state.clone(),
|
||||
tenant_manager.clone(),
|
||||
background_jobs_barrier.clone(),
|
||||
)?;
|
||||
}
|
||||
@@ -566,7 +557,6 @@ fn start_pageserver(
|
||||
broker_client.clone(),
|
||||
disk_usage_eviction_state,
|
||||
deletion_queue.new_client(),
|
||||
secondary_controller,
|
||||
)
|
||||
.context("Failed to initialize router state")?,
|
||||
);
|
||||
|
||||
@@ -33,8 +33,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
|
||||
@@ -162,7 +161,7 @@ pub struct PageServerConf {
|
||||
pub http_auth_type: AuthType,
|
||||
/// authentication method for libpq connections from compute
|
||||
pub pg_auth_type: AuthType,
|
||||
/// Path to a file containing public key for verifying JWT tokens.
|
||||
/// Path to a file or directory containing public key(s) for verifying JWT tokens.
|
||||
/// Used for both mgmt and compute auth, if enabled.
|
||||
pub auth_validation_public_key_path: Option<Utf8PathBuf>,
|
||||
|
||||
@@ -633,11 +632,6 @@ impl PageServerConf {
|
||||
self.tenants_path().join(tenant_id.to_string())
|
||||
}
|
||||
|
||||
pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_id)
|
||||
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
||||
}
|
||||
|
||||
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
||||
}
|
||||
@@ -1320,12 +1314,6 @@ broker_endpoint = '{broker_endpoint}'
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
@@ -1386,8 +1374,6 @@ broker_endpoint = '{broker_endpoint}'
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_syncs,
|
||||
max_sync_errors,
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: bucket_name.clone(),
|
||||
bucket_region: bucket_region.clone(),
|
||||
|
||||
@@ -3,7 +3,6 @@ use anyhow::Context;
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::EventType;
|
||||
use futures::stream::StreamExt;
|
||||
use serde_with::serde_as;
|
||||
use std::{sync::Arc, time::SystemTime};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -42,13 +41,10 @@ pub(super) enum Name {
|
||||
///
|
||||
/// This is a denormalization done at the MetricsKey const methods; these should not be constructed
|
||||
/// elsewhere.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
|
||||
pub(crate) struct MetricsKey {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub(super) tenant_id: TenantId,
|
||||
|
||||
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(super) timeline_id: Option<TimelineId>,
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
|
||||
use serde_with::serde_as;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
|
||||
@@ -7,12 +6,9 @@ use super::{metrics::Name, Cache, MetricsKey, RawMetric};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
/// How the metrics from pageserver are identified.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
|
||||
struct Ids {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub(super) tenant_id: TenantId,
|
||||
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(super) timeline_id: Option<TimelineId>,
|
||||
}
|
||||
|
||||
@@ -57,7 +57,10 @@ impl ControlPlaneClient {
|
||||
|
||||
if let Some(jwt) = &conf.control_plane_api_token {
|
||||
let mut headers = hyper::HeaderMap::new();
|
||||
headers.insert("Authorization", jwt.get_contents().parse().unwrap());
|
||||
headers.insert(
|
||||
"Authorization",
|
||||
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
|
||||
);
|
||||
client = client.default_headers(headers);
|
||||
}
|
||||
|
||||
@@ -144,7 +147,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
Ok(response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id, Generation::new(t.generation)))
|
||||
.map(|t| (t.id, Generation::new(t.gen)))
|
||||
.collect::<HashMap<_, _>>())
|
||||
}
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
@@ -17,7 +18,6 @@ use hex::FromHex;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use serde_with::serde_as;
|
||||
use thiserror::Error;
|
||||
use tokio;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -214,7 +214,6 @@ where
|
||||
/// during recovery as startup.
|
||||
const TEMP_SUFFIX: &str = "tmp";
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DeletionList {
|
||||
/// Serialization version, for future use
|
||||
@@ -243,7 +242,6 @@ struct DeletionList {
|
||||
validated: bool,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DeletionHeader {
|
||||
/// Serialization version, for future use
|
||||
@@ -271,7 +269,9 @@ impl DeletionHeader {
|
||||
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
|
||||
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
|
||||
.await
|
||||
.map_err(Into::into)
|
||||
.maybe_fatal_err("save deletion header")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -360,6 +360,7 @@ impl DeletionList {
|
||||
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
|
||||
.await
|
||||
.maybe_fatal_err("save deletion list")
|
||||
.map_err(Into::into)
|
||||
}
|
||||
}
|
||||
@@ -892,14 +893,6 @@ mod test {
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
|
||||
let storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
||||
)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
Checking pageserver v0.1.0 (/home/neon/neon/pageserver)
|
||||
Finished dev [optimized + debuginfo] target(s) in 7.62s
|
||||
@@ -55,21 +55,24 @@ impl Deleter {
|
||||
|
||||
/// Wrap the remote `delete_objects` with a failpoint
|
||||
async fn remote_delete(&self) -> Result<(), anyhow::Error> {
|
||||
fail::fail_point!("deletion-queue-before-execute", |_| {
|
||||
info!("Skipping execution, failpoint set");
|
||||
metrics::DELETION_QUEUE
|
||||
.remote_errors
|
||||
.with_label_values(&["failpoint"])
|
||||
.inc();
|
||||
Err(anyhow::anyhow!("failpoint hit"))
|
||||
});
|
||||
|
||||
// A backoff::retry is used here for two reasons:
|
||||
// - To provide a backoff rather than busy-polling the API on errors
|
||||
// - To absorb transient 429/503 conditions without hitting our error
|
||||
// logging path for issues deleting objects.
|
||||
backoff::retry(
|
||||
|| async { self.remote_storage.delete_objects(&self.accumulator).await },
|
||||
|| async {
|
||||
fail::fail_point!("deletion-queue-before-execute", |_| {
|
||||
info!("Skipping execution, failpoint set");
|
||||
|
||||
metrics::DELETION_QUEUE
|
||||
.remote_errors
|
||||
.with_label_values(&["failpoint"])
|
||||
.inc();
|
||||
Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
|
||||
});
|
||||
|
||||
self.remote_storage.delete_objects(&self.accumulator).await
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
10,
|
||||
|
||||
@@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::virtual_file::on_fatal_io_error;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
|
||||
// The number of keys in a DeletionList before we will proactively persist it
|
||||
// (without reaching a flush deadline). This aims to deliver objects of the order
|
||||
@@ -195,7 +197,7 @@ impl ListWriter {
|
||||
debug!("Deletion header {header_path} not found, first start?");
|
||||
Ok(None)
|
||||
} else {
|
||||
Err(anyhow::anyhow!(e))
|
||||
on_fatal_io_error(&e, "reading deletion header");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -216,16 +218,9 @@ impl ListWriter {
|
||||
self.pending.sequence = validated_sequence + 1;
|
||||
|
||||
let deletion_directory = self.conf.deletion_prefix();
|
||||
let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
|
||||
|
||||
// Give up: if we can't read the deletion list directory, we probably can't
|
||||
// write lists into it later, so the queue won't work.
|
||||
return Err(e.into());
|
||||
}
|
||||
};
|
||||
let mut dir = tokio::fs::read_dir(&deletion_directory)
|
||||
.await
|
||||
.fatal_err("read deletion directory");
|
||||
|
||||
let list_name_pattern =
|
||||
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
|
||||
@@ -233,7 +228,7 @@ impl ListWriter {
|
||||
let temp_extension = format!(".{TEMP_SUFFIX}");
|
||||
let header_path = self.conf.deletion_header_path();
|
||||
let mut seqs: Vec<u64> = Vec::new();
|
||||
while let Some(dentry) = dir.next_entry().await? {
|
||||
while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
|
||||
let file_name = dentry.file_name();
|
||||
let dentry_str = file_name.to_string_lossy();
|
||||
|
||||
@@ -246,11 +241,9 @@ impl ListWriter {
|
||||
info!("Cleaning up temporary file {dentry_str}");
|
||||
let absolute_path =
|
||||
deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
|
||||
if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
|
||||
// Non-fatal error: we will just leave the file behind but not
|
||||
// try and load it.
|
||||
warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
|
||||
}
|
||||
tokio::fs::remove_file(&absolute_path)
|
||||
.await
|
||||
.fatal_err("delete temp file");
|
||||
|
||||
continue;
|
||||
}
|
||||
@@ -290,7 +283,9 @@ impl ListWriter {
|
||||
for s in seqs {
|
||||
let list_path = self.conf.deletion_list_path(s);
|
||||
|
||||
let list_bytes = tokio::fs::read(&list_path).await?;
|
||||
let list_bytes = tokio::fs::read(&list_path)
|
||||
.await
|
||||
.fatal_err("read deletion list");
|
||||
|
||||
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
|
||||
Ok(l) => l,
|
||||
@@ -349,7 +344,7 @@ impl ListWriter {
|
||||
info!("Started deletion frontend worker");
|
||||
|
||||
// Synchronous, but we only do it once per process lifetime so it's tolerable
|
||||
if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
|
||||
if let Err(e) = create_dir_all(self.conf.deletion_prefix()) {
|
||||
tracing::error!(
|
||||
"Failed to create deletion list directory {}, deletions will not be executed ({e})",
|
||||
self.conf.deletion_prefix(),
|
||||
|
||||
@@ -28,6 +28,7 @@ use crate::config::PageServerConf;
|
||||
use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::control_plane_client::RetryForeverError;
|
||||
use crate::metrics;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
|
||||
use super::deleter::DeleterMessage;
|
||||
use super::DeletionHeader;
|
||||
@@ -287,16 +288,9 @@ where
|
||||
async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
|
||||
for list_path in list_paths {
|
||||
debug!("Removing deletion list {list_path}");
|
||||
|
||||
if let Err(e) = tokio::fs::remove_file(&list_path).await {
|
||||
// Unexpected: we should have permissions and nothing else should
|
||||
// be touching these files. We will leave the file behind. Subsequent
|
||||
// pageservers will try and load it again: hopefully whatever storage
|
||||
// issue (probably permissions) has been fixed by then.
|
||||
tracing::error!("Failed to delete {list_path}: {e:#}");
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
break;
|
||||
}
|
||||
tokio::fs::remove_file(&list_path)
|
||||
.await
|
||||
.fatal_err("remove deletion list");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -48,23 +48,20 @@ use std::{
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, instrument, warn, Instrument};
|
||||
use utils::{
|
||||
completion,
|
||||
id::{TenantId, TenantTimelineId},
|
||||
};
|
||||
use utils::{id::TimelineId, serde_percent::Percent};
|
||||
use utils::completion;
|
||||
use utils::serde_percent::Percent;
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
mgr::TenantManager,
|
||||
secondary::SecondaryTenant,
|
||||
self,
|
||||
storage_layer::{AsLayerDesc, EvictionError, Layer},
|
||||
Timeline,
|
||||
},
|
||||
@@ -90,7 +87,6 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
conf: &'static PageServerConf,
|
||||
storage: GenericRemoteStorage,
|
||||
state: Arc<State>,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
background_jobs_barrier: completion::Barrier,
|
||||
) -> anyhow::Result<()> {
|
||||
let Some(task_config) = &conf.disk_usage_based_eviction else {
|
||||
@@ -116,7 +112,8 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
_ = background_jobs_barrier.wait() => { }
|
||||
};
|
||||
|
||||
disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
|
||||
disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
|
||||
.await;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
@@ -129,7 +126,7 @@ async fn disk_usage_eviction_task(
|
||||
state: &State,
|
||||
task_config: &DiskUsageEvictionTaskConfig,
|
||||
_storage: &GenericRemoteStorage,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
tenants_dir: &Utf8Path,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
scopeguard::defer! {
|
||||
@@ -153,8 +150,7 @@ async fn disk_usage_eviction_task(
|
||||
|
||||
async {
|
||||
let res =
|
||||
disk_usage_eviction_task_iteration(state, task_config, &tenant_manager, &cancel)
|
||||
.await;
|
||||
disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
|
||||
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
@@ -185,14 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
|
||||
async fn disk_usage_eviction_task_iteration(
|
||||
state: &State,
|
||||
task_config: &DiskUsageEvictionTaskConfig,
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
tenants_dir: &Utf8Path,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenants_dir = tenant_manager.get_conf().tenants_path();
|
||||
let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
|
||||
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
.context("get filesystem-level disk usage before evictions")?;
|
||||
let res =
|
||||
disk_usage_eviction_task_iteration_impl(state, usage_pre, tenant_manager, cancel).await;
|
||||
let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
|
||||
match res {
|
||||
Ok(outcome) => {
|
||||
debug!(?outcome, "disk_usage_eviction_iteration finished");
|
||||
@@ -202,7 +196,7 @@ async fn disk_usage_eviction_task_iteration(
|
||||
}
|
||||
IterationOutcome::Finished(outcome) => {
|
||||
// Verify with statvfs whether we made any real progress
|
||||
let after = filesystem_level_usage::get(&tenants_dir, task_config)
|
||||
let after = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
// It's quite unlikely to hit the error here. Keep the code simple and bail out.
|
||||
.context("get filesystem-level disk usage after evictions")?;
|
||||
|
||||
@@ -277,7 +271,6 @@ struct LayerCount {
|
||||
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
state: &State,
|
||||
usage_pre: U,
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<IterationOutcome<U>> {
|
||||
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
|
||||
@@ -297,7 +290,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
"running disk usage based eviction due to pressure"
|
||||
);
|
||||
|
||||
let candidates = match collect_eviction_candidates(tenant_manager, cancel).await? {
|
||||
let candidates = match collect_eviction_candidates(cancel).await? {
|
||||
EvictionCandidates::Cancelled => {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
@@ -333,13 +326,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// If we get far enough in the list that we start to evict layers that are below
|
||||
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
|
||||
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
|
||||
|
||||
// Evictions for attached tenants, batched by timeline
|
||||
let mut batched: HashMap<_, Vec<_>> = HashMap::new();
|
||||
|
||||
// Evictions for secondary locations, batched by tenant
|
||||
let mut secondary_by_tenant: HashMap<TenantId, Vec<(TimelineId, Layer)>> = HashMap::new();
|
||||
|
||||
let mut warned = None;
|
||||
let mut usage_planned = usage_pre;
|
||||
let mut max_batch_size = 0;
|
||||
@@ -362,22 +349,14 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
|
||||
// tasks to evict all seen layers until we have evicted enough
|
||||
|
||||
match candidate.source {
|
||||
EvictionCandidateSource::Attached(timeline) => {
|
||||
let batch = batched.entry(TimelineKey(timeline)).or_default();
|
||||
let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
|
||||
|
||||
// semaphore will later be used to limit eviction concurrency, and we can express at
|
||||
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
|
||||
// but fail gracefully by not making batches larger.
|
||||
if batch.len() < u32::MAX as usize {
|
||||
batch.push(candidate.layer);
|
||||
max_batch_size = max_batch_size.max(batch.len());
|
||||
}
|
||||
}
|
||||
EvictionCandidateSource::Secondary(ttid) => {
|
||||
let batch = secondary_by_tenant.entry(ttid.tenant_id).or_default();
|
||||
batch.push((ttid.timeline_id, candidate.layer));
|
||||
}
|
||||
// semaphore will later be used to limit eviction concurrency, and we can express at
|
||||
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
|
||||
// but fail gracefully by not making batches larger.
|
||||
if batch.len() < u32::MAX as usize {
|
||||
batch.push(candidate.layer);
|
||||
max_batch_size = max_batch_size.max(batch.len());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -393,20 +372,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
};
|
||||
debug!(?usage_planned, "usage planned");
|
||||
|
||||
// phase2 (secondary tenants): evict victims batched by tenant
|
||||
for (tenant_id, timeline_layers) in secondary_by_tenant {
|
||||
// Q: Why do we go via TenantManager again rather than just deleting files, or keeping
|
||||
// an Arc ref to the secondary state?
|
||||
// A: It's because a given tenant's local storage **belongs** to whoever is currently
|
||||
// live in the TenantManager. We must avoid a race where we might plan an eviction
|
||||
// for secondary, and then execute it when the tenant is actually in an attached state.
|
||||
tenant_manager
|
||||
.evict_tenant_layers(&tenant_id, timeline_layers)
|
||||
.instrument(tracing::info_span!("evict_batch", %tenant_id))
|
||||
.await;
|
||||
}
|
||||
|
||||
// phase2 (attached tenants): evict victims batched by timeline
|
||||
// phase2: evict victims batched by timeline
|
||||
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
|
||||
@@ -437,7 +403,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
return (evicted_bytes, evictions_failed);
|
||||
};
|
||||
|
||||
let results = timeline.evict_layers(&batch, &cancel).await;
|
||||
let results = timeline.evict_layers(&batch).await;
|
||||
|
||||
match results {
|
||||
Ok(results) => {
|
||||
@@ -514,18 +480,9 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
}))
|
||||
}
|
||||
|
||||
// An eviction candidate might originate from either an attached tenant
|
||||
// with a [`Tenant`] and [`Timeline`] object, or from a secondary tenant
|
||||
// location. These differ in how we will execute the eviction.
|
||||
#[derive(Clone)]
|
||||
enum EvictionCandidateSource {
|
||||
Attached(Arc<Timeline>),
|
||||
Secondary(TenantTimelineId),
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct EvictionCandidate {
|
||||
source: EvictionCandidateSource,
|
||||
timeline: Arc<Timeline>,
|
||||
layer: Layer,
|
||||
last_activity_ts: SystemTime,
|
||||
}
|
||||
@@ -575,18 +532,32 @@ enum EvictionCandidates {
|
||||
/// after exhauting the `Above` partition.
|
||||
/// So, we did not respect each tenant's min_resident_size.
|
||||
async fn collect_eviction_candidates(
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<EvictionCandidates> {
|
||||
// get a snapshot of the list of tenants
|
||||
let tenants = tenant::mgr::list_tenants()
|
||||
.await
|
||||
.context("get list of tenants")?;
|
||||
|
||||
let mut candidates = Vec::new();
|
||||
|
||||
let tenants = tenant_manager.get_attached_tenants();
|
||||
|
||||
for tenant in tenants {
|
||||
for (tenant_id, _state) in &tenants {
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
// this can happen if tenant has lifecycle transition after we fetched it
|
||||
debug!("failed to get tenant: {e:#}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if tenant.cancel.is_cancelled() {
|
||||
info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
|
||||
continue;
|
||||
}
|
||||
|
||||
// collect layers from all timelines in this tenant
|
||||
//
|
||||
@@ -649,7 +620,7 @@ async fn collect_eviction_candidates(
|
||||
for (timeline, layer_info) in tenant_candidates.into_iter() {
|
||||
let file_size = layer_info.file_size();
|
||||
let candidate = EvictionCandidate {
|
||||
source: EvictionCandidateSource::Attached(timeline),
|
||||
timeline,
|
||||
last_activity_ts: layer_info.last_activity_ts,
|
||||
layer: layer_info.layer,
|
||||
};
|
||||
@@ -663,43 +634,6 @@ async fn collect_eviction_candidates(
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: this is a long loop over all secondary locations. At the least, respect
|
||||
// cancellation here, but really we need to break up the loop. We could extract the
|
||||
// Arc<SecondaryTenant>s and iterate over them with some tokio yields in there. Ideally
|
||||
// though we should just reduce the total amount of work: our eviction goals do not require
|
||||
// listing absolutely every layer in every tenant: we could sample this.
|
||||
tenant_manager.foreach_secondary_tenants(
|
||||
|tenant_id: &TenantId, state: &Arc<SecondaryTenant>| {
|
||||
let mut tenant_candidates = Vec::new();
|
||||
for (timeline_id, layer_info) in state.get_layers_for_eviction() {
|
||||
debug!(tenant_id=%tenant_id, timeline_id=%timeline_id, "timeline resident layers (secondary) count: {}", layer_info.resident_layers.len());
|
||||
tenant_candidates.extend(
|
||||
layer_info.resident_layers
|
||||
.into_iter()
|
||||
.map(|layer_infos| (timeline_id, layer_infos)),
|
||||
);
|
||||
}
|
||||
|
||||
tenant_candidates
|
||||
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
|
||||
candidates.extend(tenant_candidates.into_iter().map(|(timeline_id, candidate)| {
|
||||
(
|
||||
// Secondary locations' layers are always considered above the min resident size,
|
||||
// i.e. secondary locations are permitted to be trimmed to zero layers if all
|
||||
// the layers have sufficiently old access times.
|
||||
MinResidentSizePartition::Above,
|
||||
EvictionCandidate {
|
||||
source: EvictionCandidateSource::Secondary(TenantTimelineId { tenant_id: *tenant_id, timeline_id}),
|
||||
last_activity_ts: candidate.last_activity_ts,
|
||||
layer: candidate.layer,
|
||||
}
|
||||
)
|
||||
}));
|
||||
|
||||
},
|
||||
);
|
||||
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
candidates
|
||||
|
||||
@@ -52,6 +52,31 @@ paths:
|
||||
schema:
|
||||
type: object
|
||||
|
||||
/v1/reload_auth_validation_keys:
|
||||
post:
|
||||
description: Reloads the JWT public keys from their pre-configured location on disk.
|
||||
responses:
|
||||
"200":
|
||||
description: The reload completed successfully.
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error (also hits if no keys were found)
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -327,7 +352,8 @@ paths:
|
||||
in: query
|
||||
required: true
|
||||
schema:
|
||||
type: integer
|
||||
type: string
|
||||
format: hex
|
||||
description: A LSN to get the timestamp
|
||||
responses:
|
||||
"200":
|
||||
@@ -569,7 +595,17 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"409":
|
||||
description: Tenant download is already in progress
|
||||
description: |
|
||||
The tenant is already known to Pageserver in some way,
|
||||
and hence this `/attach` call has been rejected.
|
||||
|
||||
Some examples of how this can happen:
|
||||
- tenant was created on this pageserver
|
||||
- tenant attachment was started by an earlier call to `/attach`.
|
||||
|
||||
Callers should poll the tenant status's `attachment_status` field,
|
||||
like for status 202. See the longer description for `POST /attach`
|
||||
for details.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -713,6 +749,12 @@ paths:
|
||||
|
||||
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
|
||||
Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
|
||||
requestBody:
|
||||
required: false
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantLoadRequest"
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
@@ -1203,6 +1245,15 @@ components:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
generation:
|
||||
type: integer
|
||||
description: Attachment generation number.
|
||||
TenantLoadRequest:
|
||||
type: object
|
||||
properties:
|
||||
generation:
|
||||
type: integer
|
||||
description: Attachment generation number.
|
||||
TenantAttachRequest:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use futures::TryFutureExt;
|
||||
@@ -18,10 +17,10 @@ use pageserver_api::models::{
|
||||
TenantLoadRequest, TenantLocationConfigRequest,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
@@ -40,7 +39,6 @@ use crate::tenant::mgr::{
|
||||
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
|
||||
TenantSlotError, TenantSlotUpsertError, TenantStateError,
|
||||
};
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
@@ -48,7 +46,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
auth::SwappableJwtAuth,
|
||||
generation::Generation,
|
||||
http::{
|
||||
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
|
||||
@@ -67,26 +65,23 @@ use super::models::ConfigureFailpointsRequest;
|
||||
pub struct State {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
}
|
||||
|
||||
impl State {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
|
||||
.iter()
|
||||
@@ -101,7 +96,6 @@ impl State {
|
||||
broker_client,
|
||||
disk_usage_eviction_state,
|
||||
deletion_queue_client,
|
||||
secondary_controller,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -171,8 +165,7 @@ impl From<TenantSlotError> for ApiError {
|
||||
NotFound(tenant_id) => {
|
||||
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
|
||||
}
|
||||
e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
|
||||
e @ Conflict(_) => ApiError::Conflict(format!("{e}")),
|
||||
e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
|
||||
InProgress => {
|
||||
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
|
||||
}
|
||||
@@ -231,6 +224,7 @@ impl From<GetTenantError> for ApiError {
|
||||
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
|
||||
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
||||
}
|
||||
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -309,11 +303,7 @@ async fn build_timeline_info(
|
||||
// we're executing this function, we will outlive the timeline on-disk state.
|
||||
info.current_logical_size_non_incremental = Some(
|
||||
timeline
|
||||
.get_current_logical_size_non_incremental(
|
||||
info.last_record_lsn,
|
||||
CancellationToken::new(),
|
||||
ctx,
|
||||
)
|
||||
.get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
@@ -399,6 +389,32 @@ async fn status_handler(
|
||||
json_response(StatusCode::OK, StatusResponse { id: config.id })
|
||||
}
|
||||
|
||||
async fn reload_auth_validation_keys_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let config = get_config(&request);
|
||||
let state = get_state(&request);
|
||||
let Some(shared_auth) = &state.auth else {
|
||||
return json_response(StatusCode::BAD_REQUEST, ());
|
||||
};
|
||||
// unwrap is ok because check is performed when creating config, so path is set and exists
|
||||
let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
|
||||
info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
|
||||
|
||||
match JwtAuth::from_key_path(key_path) {
|
||||
Ok(new_auth) => {
|
||||
shared_auth.swap(new_auth);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Error reloading public keys from {key_path:?}: {e:}");
|
||||
json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn timeline_create_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -442,6 +458,9 @@ async fn timeline_create_handler(
|
||||
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
|
||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::ShuttingDown) => {
|
||||
json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
}
|
||||
@@ -544,10 +563,8 @@ async fn get_lsn_by_timestamp_handler(
|
||||
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
|
||||
|
||||
if version.unwrap_or(0) > 1 {
|
||||
#[serde_as]
|
||||
#[derive(serde::Serialize)]
|
||||
struct Result {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn: Lsn,
|
||||
kind: &'static str,
|
||||
}
|
||||
@@ -587,7 +604,7 @@ async fn get_timestamp_of_lsn_handler(
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
|
||||
let result = timeline.get_max_timestamp_for_lsn(lsn, &ctx).await?;
|
||||
|
||||
match result {
|
||||
Some(time) => {
|
||||
@@ -856,10 +873,8 @@ async fn tenant_size_handler(
|
||||
}
|
||||
|
||||
/// The type resides in the pageserver not to expose `ModelInputs`.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Serialize)]
|
||||
struct TenantHistorySize {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
id: TenantId,
|
||||
/// Size is a mixture of WAL and logical size, so the unit is bytes.
|
||||
///
|
||||
@@ -1124,9 +1139,6 @@ async fn put_tenant_location_config_handler(
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
|
||||
|
||||
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
|
||||
|
||||
let tenant_id = request_data.tenant_id;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
@@ -1156,7 +1168,7 @@ async fn put_tenant_location_config_handler(
|
||||
|
||||
state
|
||||
.tenant_manager
|
||||
.upsert_location(tenant_id, location_conf, flush, &ctx)
|
||||
.upsert_location(tenant_id, location_conf, &ctx)
|
||||
.await
|
||||
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
||||
// principle we might have hit something like concurrent API calls to the same tenant,
|
||||
@@ -1466,7 +1478,7 @@ async fn timeline_collect_keyspace(
|
||||
let keys = timeline
|
||||
.collect_keyspace(at_lsn, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
|
||||
}
|
||||
@@ -1547,12 +1559,11 @@ async fn disk_usage_eviction_run(
|
||||
)));
|
||||
}
|
||||
|
||||
let eviction_state = state.disk_usage_eviction_state.clone();
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let child_cancel = cancel.clone();
|
||||
let _g = cancel.drop_guard();
|
||||
let tenant_manager = state.tenant_manager.clone();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
@@ -1563,9 +1574,8 @@ async fn disk_usage_eviction_run(
|
||||
false,
|
||||
async move {
|
||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||
&eviction_state,
|
||||
&state,
|
||||
usage,
|
||||
&tenant_manager,
|
||||
&child_cancel,
|
||||
)
|
||||
.await;
|
||||
@@ -1583,36 +1593,6 @@ async fn disk_usage_eviction_run(
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
async fn secondary_download_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
state
|
||||
.secondary_controller
|
||||
.download_tenant(tenant_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn secondary_upload_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
state
|
||||
.secondary_controller
|
||||
.upload_tenant(tenant_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -1687,6 +1667,8 @@ where
|
||||
);
|
||||
|
||||
match handle.await {
|
||||
// TODO: never actually return Err from here, always Ok(...) so that we can log
|
||||
// spanned errors. Call api_error_handler instead and return appropriate Body.
|
||||
Ok(result) => result,
|
||||
Err(e) => {
|
||||
// The handler task panicked. We have a global panic handler that logs the
|
||||
@@ -1735,7 +1717,7 @@ where
|
||||
pub fn make_router(
|
||||
state: Arc<State>,
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
||||
@@ -1764,6 +1746,9 @@ pub fn make_router(
|
||||
.put("/v1/failpoints", |r| {
|
||||
testing_api_handler("manage failpoints", r, failpoints_handler)
|
||||
})
|
||||
.post("/v1/reload_auth_validation_keys", |r| {
|
||||
api_handler(r, reload_auth_validation_keys_handler)
|
||||
})
|
||||
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
||||
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
||||
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
|
||||
@@ -1849,16 +1834,6 @@ pub fn make_router(
|
||||
.put("/v1/deletion_queue/flush", |r| {
|
||||
api_handler(r, deletion_queue_flush)
|
||||
})
|
||||
.post("/v1/secondary/:tenant_id/upload", |r| {
|
||||
testing_api_handler("force heatmap upload", r, secondary_upload_handler)
|
||||
})
|
||||
.post("/v1/secondary/:tenant_id/download", |r| {
|
||||
testing_api_handler(
|
||||
"force secondary layer download",
|
||||
r,
|
||||
secondary_download_handler,
|
||||
)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/break", |r| {
|
||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||
})
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
mod auth;
|
||||
pub mod basebackup;
|
||||
pub mod config;
|
||||
@@ -61,14 +63,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down any page service tasks.
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||
"shutdown PageRequestHandlers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down all the tenants. This flushes everything to disk and kills
|
||||
// the checkpoint and GC tasks.
|
||||
timed(
|
||||
@@ -78,6 +72,15 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
|
||||
// should already have been canclled via mgr::shutdown_all_tenants
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||
"shutdown PageRequestHandlers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
if let Some(mut deletion_queue) = deletion_queue {
|
||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||
|
||||
@@ -962,6 +962,32 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) struct TenantManagerMetrics {
|
||||
pub(crate) tenant_slots: UIntGauge,
|
||||
pub(crate) tenant_slot_writes: IntCounter,
|
||||
pub(crate) unexpected_errors: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
|
||||
TenantManagerMetrics {
|
||||
tenant_slots: register_uint_gauge!(
|
||||
"pageserver_tenant_manager_slots",
|
||||
"How many slots currently exist, including all attached, secondary and in-progress operations",
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
tenant_slot_writes: register_int_counter!(
|
||||
"pageserver_tenant_manager_slot_writes",
|
||||
"Writes to a tenant slot, including all of create/attach/detach/delete"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
unexpected_errors: register_int_counter!(
|
||||
"pageserver_tenant_manager_unexpected_errors_total",
|
||||
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct DeletionQueueMetrics {
|
||||
pub(crate) keys_submitted: IntCounter,
|
||||
pub(crate) keys_dropped: IntCounter,
|
||||
@@ -1199,15 +1225,6 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_wait_seconds",
|
||||
"Time spent waiting for access to the Postgres WAL redo process",
|
||||
redo_histogram_time_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_records_histogram",
|
||||
@@ -1235,6 +1252,46 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) struct WalRedoProcessCounters {
|
||||
pub(crate) started: IntCounter,
|
||||
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
|
||||
}
|
||||
|
||||
#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
||||
pub(crate) enum WalRedoKillCause {
|
||||
WalRedoProcessDrop,
|
||||
NoLeakChildDrop,
|
||||
Startup,
|
||||
}
|
||||
|
||||
impl Default for WalRedoProcessCounters {
|
||||
fn default() -> Self {
|
||||
let started = register_int_counter!(
|
||||
"pageserver_wal_redo_process_started_total",
|
||||
"Number of WAL redo processes started",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let killed = register_int_counter_vec!(
|
||||
"pageserver_wal_redo_process_stopped_total",
|
||||
"Number of WAL redo processes stopped",
|
||||
&["cause"],
|
||||
)
|
||||
.unwrap();
|
||||
Self {
|
||||
started,
|
||||
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
|
||||
let cause_str: &'static str = cause.into();
|
||||
killed.with_label_values(&[cause_str])
|
||||
})),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
||||
Lazy::new(WalRedoProcessCounters::default);
|
||||
|
||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||
pub struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
@@ -1884,6 +1941,9 @@ pub fn preinitialize_metrics() {
|
||||
// Deletion queue stats
|
||||
Lazy::force(&DELETION_QUEUE);
|
||||
|
||||
// Tenant manager stats
|
||||
Lazy::force(&TENANT_MANAGER);
|
||||
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
@@ -1899,7 +1959,6 @@ pub fn preinitialize_metrics() {
|
||||
&READ_NUM_FS_LAYERS,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
&WAL_REDO_WAIT_TIME,
|
||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||
&WAL_REDO_BYTES_HISTOGRAM,
|
||||
]
|
||||
|
||||
@@ -40,7 +40,7 @@ use tracing::field;
|
||||
use tracing::*;
|
||||
use utils::id::ConnectionId;
|
||||
use utils::{
|
||||
auth::{Claims, JwtAuth, Scope},
|
||||
auth::{Claims, Scope, SwappableJwtAuth},
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
simple_rcu::RcuReadGuard,
|
||||
@@ -55,16 +55,20 @@ use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::mgr::GetTenantError;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
use crate::tenant::mgr::get_active_tenant_with_timeout;
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::trace::Tracer;
|
||||
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
|
||||
// is not yet in state [`TenantState::Active`].
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
/// Read the end of a tar archive.
|
||||
///
|
||||
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
|
||||
@@ -118,7 +122,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
|
||||
pub async fn libpq_listener_main(
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
listener: TcpListener,
|
||||
auth_type: AuthType,
|
||||
listener_ctx: RequestContext,
|
||||
@@ -186,7 +190,7 @@ pub async fn libpq_listener_main(
|
||||
async fn page_service_conn_main(
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
connection_ctx: RequestContext,
|
||||
@@ -214,22 +218,34 @@ async fn page_service_conn_main(
|
||||
// no write timeout is used, because the kernel is assumed to error writes after some time.
|
||||
let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
|
||||
|
||||
// timeout should be lower, but trying out multiple days for
|
||||
// <https://github.com/neondatabase/neon/issues/4205>
|
||||
socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
|
||||
let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default
|
||||
let socket_timeout_ms = (|| {
|
||||
fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| {
|
||||
// Exponential distribution for simulating
|
||||
// poor network conditions, expect about avg_timeout_ms to be around 15
|
||||
// in tests
|
||||
if let Some(avg_timeout_ms) = avg_timeout_ms {
|
||||
let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
|
||||
let u = rand::random::<f32>();
|
||||
((1.0 - u).ln() / (-avg)) as u64
|
||||
} else {
|
||||
default_timeout_ms
|
||||
}
|
||||
});
|
||||
default_timeout_ms
|
||||
})();
|
||||
|
||||
// A timeout here does not mean the client died, it can happen if it's just idle for
|
||||
// a while: we will tear down this PageServerHandler and instantiate a new one if/when
|
||||
// they reconnect.
|
||||
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
|
||||
let socket = std::pin::pin!(socket);
|
||||
|
||||
// XXX: pgbackend.run() should take the connection_ctx,
|
||||
// and create a child per-query context when it invokes process_query.
|
||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||
// and create the per-query context in process_query ourselves.
|
||||
let mut conn_handler = PageServerHandler::new(
|
||||
conf,
|
||||
broker_client,
|
||||
auth,
|
||||
connection_ctx,
|
||||
task_mgr::shutdown_token(),
|
||||
);
|
||||
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
|
||||
match pgbackend
|
||||
@@ -255,7 +271,7 @@ async fn page_service_conn_main(
|
||||
struct PageServerHandler {
|
||||
_conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
claims: Option<Claims>,
|
||||
|
||||
/// The context created for the lifetime of the connection
|
||||
@@ -263,19 +279,14 @@ struct PageServerHandler {
|
||||
/// For each query received over the connection,
|
||||
/// `process_query` creates a child context from this one.
|
||||
connection_ctx: RequestContext,
|
||||
|
||||
/// A token that should fire when the tenant transitions from
|
||||
/// attached state, or when the pageserver is shutting down.
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl PageServerHandler {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
PageServerHandler {
|
||||
_conf: conf,
|
||||
@@ -283,7 +294,6 @@ impl PageServerHandler {
|
||||
auth,
|
||||
claims: None,
|
||||
connection_ctx,
|
||||
cancel,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -291,7 +301,11 @@ impl PageServerHandler {
|
||||
/// this rather than naked flush() in order to shut down promptly. Without this, we would
|
||||
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send
|
||||
/// in the flush.
|
||||
async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
|
||||
async fn flush_cancellable<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
@@ -299,7 +313,7 @@ impl PageServerHandler {
|
||||
flush_r = pgb.flush() => {
|
||||
Ok(flush_r?)
|
||||
},
|
||||
_ = self.cancel.cancelled() => {
|
||||
_ = cancel.cancelled() => {
|
||||
Err(QueryError::Shutdown)
|
||||
}
|
||||
)
|
||||
@@ -308,6 +322,7 @@ impl PageServerHandler {
|
||||
fn copyin_stream<'a, IO>(
|
||||
&'a self,
|
||||
pgb: &'a mut PostgresBackend<IO>,
|
||||
cancel: &'a CancellationToken,
|
||||
) -> impl Stream<Item = io::Result<Bytes>> + 'a
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -317,7 +332,7 @@ impl PageServerHandler {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = self.cancel.cancelled() => {
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
let msg = "pageserver is shutting down";
|
||||
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
|
||||
@@ -357,7 +372,7 @@ impl PageServerHandler {
|
||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||
self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
||||
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
}
|
||||
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
|
||||
@@ -384,12 +399,13 @@ impl PageServerHandler {
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// NOTE: pagerequests handler exits when connection is closed,
|
||||
// so there is no need to reset the association
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
// Make request tracer if needed
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
let tenant = mgr::get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
let mut tracer = if tenant.get_trace_read_requests() {
|
||||
let connection_id = ConnectionId::generate();
|
||||
let path = tenant
|
||||
@@ -405,9 +421,14 @@ impl PageServerHandler {
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
// Avoid starting new requests if the timeline has already started shutting down,
|
||||
// and block timeline shutdown until this request is complete, or drops out due
|
||||
// to cancellation.
|
||||
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
|
||||
|
||||
// switch client to COPYBOTH
|
||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||
self.flush_cancellable(pgb).await?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
|
||||
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||
|
||||
@@ -415,7 +436,7 @@ impl PageServerHandler {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = self.cancel.cancelled() => {
|
||||
_ = timeline.cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
return Err(QueryError::Shutdown)
|
||||
@@ -490,9 +511,24 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(e) = &response {
|
||||
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
|
||||
// because wait_lsn etc will drop out
|
||||
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
|
||||
// is_canceled(): [`Timeline::shutdown`]` has entered
|
||||
if timeline.cancel.is_cancelled() || timeline.is_stopping() {
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
// shutdown, then do not send the error to the client. Instead just drop the
|
||||
// connection.
|
||||
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
}
|
||||
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
@@ -500,7 +536,7 @@ impl PageServerHandler {
|
||||
});
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
|
||||
self.flush_cancellable(pgb).await?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -522,10 +558,14 @@ impl PageServerHandler {
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
let timeline = tenant
|
||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
||||
.await?;
|
||||
@@ -543,9 +583,9 @@ impl PageServerHandler {
|
||||
// Import basebackup provided via CopyData
|
||||
info!("importing basebackup");
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
self.flush_cancellable(pgb).await?;
|
||||
self.flush_cancellable(pgb, &tenant.cancel).await?;
|
||||
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
|
||||
timeline
|
||||
.import_basebackup_from_tar(
|
||||
&mut copyin_reader,
|
||||
@@ -582,9 +622,10 @@ impl PageServerHandler {
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||
.await?;
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
if last_record_lsn != start_lsn {
|
||||
return Err(QueryError::Other(
|
||||
@@ -598,8 +639,8 @@ impl PageServerHandler {
|
||||
// Import wal provided via CopyData
|
||||
info!("importing wal");
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
self.flush_cancellable(pgb).await?;
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
|
||||
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
|
||||
info!("wal import complete");
|
||||
|
||||
@@ -792,7 +833,9 @@ impl PageServerHandler {
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||
.await?;
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
@@ -807,7 +850,7 @@ impl PageServerHandler {
|
||||
|
||||
// switch client to COPYOUT
|
||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
||||
self.flush_cancellable(pgb).await?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
|
||||
// Send a tarball of the latest layer on the timeline. Compress if not
|
||||
// fullbackup. TODO Compress in that case too (tests need to be updated)
|
||||
@@ -859,7 +902,7 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||
self.flush_cancellable(pgb).await?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
|
||||
let basebackup_after = started
|
||||
.elapsed()
|
||||
@@ -877,7 +920,7 @@ impl PageServerHandler {
|
||||
|
||||
// when accessing management api supply None as an argument
|
||||
// when using to authorize tenant pass corresponding tenant id
|
||||
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
|
||||
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
|
||||
if self.auth.is_none() {
|
||||
// auth is set to Trust, nothing to check so just return ok
|
||||
return Ok(());
|
||||
@@ -889,7 +932,26 @@ impl PageServerHandler {
|
||||
.claims
|
||||
.as_ref()
|
||||
.expect("claims presence already checked");
|
||||
check_permission(claims, tenant_id)
|
||||
check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
|
||||
}
|
||||
|
||||
/// Shorthand for getting a reference to a Timeline of an Active tenant.
|
||||
async fn get_active_tenant_timeline(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
||||
Ok(timeline)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -909,16 +971,17 @@ where
|
||||
.auth
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
|
||||
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
|
||||
.map_err(|e| QueryError::Unauthorized(e.0))?;
|
||||
|
||||
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"jwt token scope is Tenant, but tenant id is missing"
|
||||
)));
|
||||
return Err(QueryError::Unauthorized(
|
||||
"jwt token scope is Tenant, but tenant id is missing".into(),
|
||||
));
|
||||
}
|
||||
|
||||
info!(
|
||||
"jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
|
||||
debug!(
|
||||
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
|
||||
data.claims.scope, data.claims.tenant_id,
|
||||
);
|
||||
|
||||
@@ -940,9 +1003,13 @@ where
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
query_string: &str,
|
||||
) -> Result<(), QueryError> {
|
||||
fail::fail_point!("simulated-bad-compute-connection", |_| {
|
||||
info!("Hit failpoint for bad connection");
|
||||
Err(QueryError::SimulatedConnectionError)
|
||||
});
|
||||
|
||||
let ctx = self.connection_ctx.attached_child();
|
||||
debug!("process query {query_string:?}");
|
||||
|
||||
if query_string.starts_with("pagestream ") {
|
||||
let (_, params_raw) = query_string.split_at("pagestream ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
@@ -1048,7 +1115,9 @@ where
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
|
||||
@@ -1232,7 +1301,12 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
||||
@@ -1278,67 +1352,16 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum GetActiveTenantError {
|
||||
#[error(
|
||||
"Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
|
||||
)]
|
||||
WaitForActiveTimeout {
|
||||
latest_state: TenantState,
|
||||
wait_time: Duration,
|
||||
},
|
||||
#[error(transparent)]
|
||||
NotFound(GetTenantError),
|
||||
#[error(transparent)]
|
||||
WaitTenantActive(tenant::WaitToBecomeActiveError),
|
||||
}
|
||||
|
||||
impl From<GetActiveTenantError> for QueryError {
|
||||
fn from(e: GetActiveTenantError) -> Self {
|
||||
match e {
|
||||
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
|
||||
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
|
||||
),
|
||||
GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
|
||||
GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get active tenant.
|
||||
///
|
||||
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
|
||||
/// ensures that queries don't fail immediately after pageserver startup, because
|
||||
/// all tenants are still loading.
|
||||
async fn get_active_tenant_with_timeout(
|
||||
tenant_id: TenantId,
|
||||
_ctx: &RequestContext, /* require get a context to support cancellation in the future */
|
||||
) -> Result<Arc<Tenant>, GetActiveTenantError> {
|
||||
let tenant = match mgr::get_tenant(tenant_id, false) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
|
||||
Err(GetTenantError::NotActive(_)) => {
|
||||
unreachable!("we're calling get_tenant with active_only=false")
|
||||
}
|
||||
Err(GetTenantError::Broken(_)) => {
|
||||
unreachable!("we're calling get_tenant with active_only=false")
|
||||
}
|
||||
};
|
||||
let wait_time = Duration::from_secs(30);
|
||||
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
|
||||
Ok(Ok(())) => Ok(tenant),
|
||||
// no .context(), the error message is good enough and some tests depend on it
|
||||
Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
|
||||
Err(_) => {
|
||||
let latest_state = tenant.current_state();
|
||||
if latest_state == TenantState::Active {
|
||||
Ok(tenant)
|
||||
} else {
|
||||
Err(GetActiveTenantError::WaitForActiveTimeout {
|
||||
latest_state,
|
||||
wait_time,
|
||||
})
|
||||
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
|
||||
QueryError::Shutdown
|
||||
}
|
||||
e => QueryError::Other(anyhow::anyhow!(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1359,18 +1382,3 @@ impl From<GetActiveTimelineError> for QueryError {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Shorthand for getting a reference to a Timeline of an Active tenant.
|
||||
async fn get_active_tenant_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
|
||||
.await
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
@@ -21,8 +21,9 @@ use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::exp_counter::ExpCounter;
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
|
||||
@@ -30,9 +31,33 @@ pub type BlockNumber = u32;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum LsnForTimestamp {
|
||||
/// Found commits both before and after the given timestamp
|
||||
Present(Lsn),
|
||||
|
||||
/// Found no commits after the given timestamp, this means
|
||||
/// that the newest data in the branch is older than the given
|
||||
/// timestamp.
|
||||
///
|
||||
/// All commits <= LSN happened before the given timestamp
|
||||
Future(Lsn),
|
||||
|
||||
/// The queried timestamp is past our horizon we look back at (PITR)
|
||||
///
|
||||
/// All commits > LSN happened after the given timestamp,
|
||||
/// but any commits < LSN might have happened before or after
|
||||
/// the given timestamp. We don't know because no data before
|
||||
/// the given lsn is available.
|
||||
Past(Lsn),
|
||||
|
||||
/// We have found no commit with a timestamp,
|
||||
/// so we can't return anything meaningful.
|
||||
///
|
||||
/// The associated LSN is the lower bound value we can safely
|
||||
/// create branches on, but no statement is made if it is
|
||||
/// older or newer than the timestamp.
|
||||
///
|
||||
/// This variant can e.g. be returned right after a
|
||||
/// cluster import.
|
||||
NoData(Lsn),
|
||||
}
|
||||
|
||||
@@ -44,6 +69,36 @@ pub enum CalculateLogicalSizeError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum CollectKeySpaceError {
|
||||
#[error(transparent)]
|
||||
Decode(#[from] DeserializeError),
|
||||
#[error(transparent)]
|
||||
PageRead(PageReconstructError),
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for CollectKeySpaceError {
|
||||
fn from(err: PageReconstructError) -> Self {
|
||||
match err {
|
||||
PageReconstructError::Cancelled => Self::Cancelled,
|
||||
err => Self::PageRead(err),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for CalculateLogicalSizeError {
|
||||
fn from(pre: PageReconstructError) -> Self {
|
||||
match pre {
|
||||
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
|
||||
Self::Cancelled
|
||||
}
|
||||
_ => Self::Other(pre.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum RelationError {
|
||||
#[error("Relation Already Exists")]
|
||||
@@ -314,7 +369,11 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<LsnForTimestamp, PageReconstructError> {
|
||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||
let min_lsn = *gc_cutoff_lsn_guard;
|
||||
// We use this method to figure out the branching LSN for the new branch, but the
|
||||
// GC cutoff could be before the branching point and we cannot create a new branch
|
||||
// with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be
|
||||
// on the safe side.
|
||||
let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn());
|
||||
let max_lsn = self.get_last_record_lsn();
|
||||
|
||||
// LSNs are always 8-byte aligned. low/mid/high represent the
|
||||
@@ -324,83 +383,92 @@ impl Timeline {
|
||||
|
||||
let mut found_smaller = false;
|
||||
let mut found_larger = false;
|
||||
|
||||
// This is a search budget to not make the search too expensive
|
||||
let mut budget = 1_000_000u32;
|
||||
while low < high {
|
||||
// cannot overflow, high and low are both smaller than u64::MAX / 2
|
||||
let mid = (high + low) / 2;
|
||||
// this always holds: low <= mid_start < high
|
||||
let mid_start = (high + low) / 2;
|
||||
let mut mid = mid_start;
|
||||
|
||||
let cmp = self
|
||||
.is_latest_commit_timestamp_ge_than(
|
||||
search_timestamp,
|
||||
Lsn(mid * 8),
|
||||
&mut found_smaller,
|
||||
&mut found_larger,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let mut max = None;
|
||||
|
||||
if cmp {
|
||||
high = mid;
|
||||
// Search for an lsn that has a commit timestamp.
|
||||
// The search with `ExpCounter` will eventually try all LSNs
|
||||
// in the range between mid_start and high, which is important
|
||||
// when we set high to mid if we have exhausted all possibilities.
|
||||
// We don't do an explosive search as we want to prove that
|
||||
// every single lsn up to high is giving inconclusive results.
|
||||
// This can only be done by trying all lsns.
|
||||
for offs in ExpCounter::with_max_and_linear_search(high - mid_start) {
|
||||
mid = offs + mid_start;
|
||||
|
||||
// Do the query for mid + 1 instead of mid so that we can make definite statements
|
||||
// about low (low's invariant: always points to commit before or at search_timestamp).
|
||||
max = self
|
||||
.get_max_timestamp_for_lsn(Lsn((mid + 1) * 8), ctx)
|
||||
.await?;
|
||||
if max.is_some() {
|
||||
break;
|
||||
}
|
||||
// Do some limiting to make sure the query does not become too expensive.
|
||||
if let Some(new_budget) = budget.checked_sub(1) {
|
||||
budget = new_budget;
|
||||
} else {
|
||||
return Ok(LsnForTimestamp::NoData(min_lsn));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(max) = max {
|
||||
if max <= search_timestamp {
|
||||
// We found a commit with timestamp before (or at) `search_timestamp`.
|
||||
found_smaller = true;
|
||||
low = mid + 1;
|
||||
} else {
|
||||
// We found a commit with timestamp after `search_timestamp`.
|
||||
found_larger = true;
|
||||
high = mid;
|
||||
}
|
||||
} else {
|
||||
low = mid + 1;
|
||||
// max is None so we have proof of a chain of None's from mid_start all up to high.
|
||||
// Thus we can safely set high to mid_start
|
||||
high = mid_start;
|
||||
}
|
||||
}
|
||||
// If `found_smaller == true`, `low` is the LSN of the last commit record
|
||||
// before or at `search_timestamp`
|
||||
//
|
||||
// FIXME: it would be better to get the LSN of the previous commit.
|
||||
// Otherwise, if you restore to the returned LSN, the database will
|
||||
// include physical changes from later commits that will be marked
|
||||
// as aborted, and will need to be vacuumed away.
|
||||
let commit_lsn = Lsn(low * 8);
|
||||
match (found_smaller, found_larger) {
|
||||
(false, false) => {
|
||||
// This can happen if no commit records have been processed yet, e.g.
|
||||
// just after importing a cluster.
|
||||
Ok(LsnForTimestamp::NoData(max_lsn))
|
||||
}
|
||||
(true, false) => {
|
||||
// Didn't find any commit timestamps larger than the request
|
||||
Ok(LsnForTimestamp::Future(max_lsn))
|
||||
Ok(LsnForTimestamp::NoData(min_lsn))
|
||||
}
|
||||
(false, true) => {
|
||||
// Didn't find any commit timestamps smaller than the request
|
||||
Ok(LsnForTimestamp::Past(max_lsn))
|
||||
Ok(LsnForTimestamp::Past(min_lsn))
|
||||
}
|
||||
(true, true) => {
|
||||
// low is the LSN of the first commit record *after* the search_timestamp,
|
||||
// Back off by one to get to the point just before the commit.
|
||||
//
|
||||
// FIXME: it would be better to get the LSN of the previous commit.
|
||||
// Otherwise, if you restore to the returned LSN, the database will
|
||||
// include physical changes from later commits that will be marked
|
||||
// as aborted, and will need to be vacuumed away.
|
||||
Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||
(true, false) => {
|
||||
// Only found commits with timestamps smaller than the request.
|
||||
// It's still a valid case for branch creation, return it.
|
||||
// And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future`
|
||||
// case, anyway.
|
||||
Ok(LsnForTimestamp::Future(commit_lsn))
|
||||
}
|
||||
(true, true) => Ok(LsnForTimestamp::Present(commit_lsn)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
|
||||
/// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
|
||||
/// Obtain the timestamp for the given lsn.
|
||||
///
|
||||
/// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
|
||||
/// with a smaller/larger timestamp.
|
||||
///
|
||||
pub async fn is_latest_commit_timestamp_ge_than(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
probe_lsn: Lsn,
|
||||
found_smaller: &mut bool,
|
||||
found_larger: &mut bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
|
||||
if timestamp >= search_timestamp {
|
||||
*found_larger = true;
|
||||
return ControlFlow::Break(true);
|
||||
} else {
|
||||
*found_smaller = true;
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Obtain the possible timestamp range for the given lsn.
|
||||
///
|
||||
/// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
|
||||
pub async fn get_timestamp_for_lsn(
|
||||
/// If the lsn has no timestamps, returns None. Returns the maximum such timestamp otherwise.
|
||||
pub(crate) async fn get_max_timestamp_for_lsn(
|
||||
&self,
|
||||
probe_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -552,7 +620,8 @@ impl Timeline {
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
},
|
||||
Err(e) => {
|
||||
warn!("Failed to get info about AUX files: {}", e);
|
||||
// This is expected: historical databases do not have the key.
|
||||
debug!("Failed to get info about AUX files: {}", e);
|
||||
Ok(HashMap::new())
|
||||
}
|
||||
}
|
||||
@@ -566,30 +635,22 @@ impl Timeline {
|
||||
pub async fn get_current_logical_size_non_incremental(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, CalculateLogicalSizeError> {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
||||
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
for rel in self
|
||||
.list_rels(*spcnode, *dbnode, lsn, ctx)
|
||||
.await
|
||||
.context("list rels")?
|
||||
{
|
||||
if cancel.is_cancelled() {
|
||||
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
}
|
||||
let relsize_key = rel_size_to_key(rel);
|
||||
let mut buf = self
|
||||
.get(relsize_key, lsn, ctx)
|
||||
.await
|
||||
.with_context(|| format!("read relation size of {rel:?}"))?;
|
||||
let mut buf = self.get(relsize_key, lsn, ctx).await?;
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
total_size += relsize as u64;
|
||||
@@ -602,11 +663,11 @@ impl Timeline {
|
||||
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
|
||||
/// Anything that's not listed maybe removed from the underlying storage (from
|
||||
/// that LSN forwards).
|
||||
pub async fn collect_keyspace(
|
||||
pub(crate) async fn collect_keyspace(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<KeySpace> {
|
||||
) -> Result<KeySpace, CollectKeySpaceError> {
|
||||
// Iterate through key ranges, greedily packing them into partitions
|
||||
let mut result = KeySpaceAccum::new();
|
||||
|
||||
@@ -615,7 +676,7 @@ impl Timeline {
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
|
||||
let dbdir = DbDirectory::des(&buf)?;
|
||||
|
||||
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
|
||||
dbs.sort_unstable();
|
||||
@@ -648,7 +709,7 @@ impl Timeline {
|
||||
let slrudir_key = slru_dir_to_key(kind);
|
||||
result.add_key(slrudir_key);
|
||||
let buf = self.get(slrudir_key, lsn, ctx).await?;
|
||||
let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
|
||||
let dir = SlruSegmentDirectory::des(&buf)?;
|
||||
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
|
||||
segments.sort_unstable();
|
||||
for segno in segments {
|
||||
@@ -666,7 +727,7 @@ impl Timeline {
|
||||
// Then pg_twophase
|
||||
result.add_key(TWOPHASEDIR_KEY);
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
||||
let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
|
||||
let twophase_dir = TwoPhaseDirectory::des(&buf)?;
|
||||
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
|
||||
xids.sort_unstable();
|
||||
for xid in xids {
|
||||
|
||||
@@ -257,12 +257,6 @@ pub enum TaskKind {
|
||||
/// See [`crate::disk_usage_eviction_task`].
|
||||
DiskUsageEviction,
|
||||
|
||||
/// See [`crate::tenant::secondary`].
|
||||
SecondaryDownloads,
|
||||
|
||||
/// See [`crate::tenant::secondary`].
|
||||
SecondaryUploads,
|
||||
|
||||
// Initial logical size calculation
|
||||
InitialLogicalSizeCalculation,
|
||||
|
||||
@@ -305,10 +299,6 @@ pub enum TaskKind {
|
||||
|
||||
#[derive(Default)]
|
||||
struct MutableTaskState {
|
||||
/// Tenant and timeline that this task is associated with.
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
/// Handle for waiting for the task to exit. It can be None, if the
|
||||
/// the task has already exited.
|
||||
join_handle: Option<JoinHandle<()>>,
|
||||
@@ -325,6 +315,11 @@ struct PageServerTask {
|
||||
// To request task shutdown, just cancel this token.
|
||||
cancel: CancellationToken,
|
||||
|
||||
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
|
||||
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
mutable: Mutex<MutableTaskState>,
|
||||
}
|
||||
|
||||
@@ -350,11 +345,9 @@ where
|
||||
kind,
|
||||
name: name.to_string(),
|
||||
cancel: cancel.clone(),
|
||||
mutable: Mutex::new(MutableTaskState {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
join_handle: None,
|
||||
}),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
mutable: Mutex::new(MutableTaskState { join_handle: None }),
|
||||
});
|
||||
|
||||
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
|
||||
@@ -424,8 +417,6 @@ async fn task_finish(
|
||||
|
||||
let mut shutdown_process = false;
|
||||
{
|
||||
let task_mut = task.mutable.lock().unwrap();
|
||||
|
||||
match result {
|
||||
Ok(Ok(())) => {
|
||||
debug!("Task '{}' exited normally", task_name);
|
||||
@@ -434,13 +425,13 @@ async fn task_finish(
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
task_name, task.tenant_id, task.timeline_id, err
|
||||
);
|
||||
shutdown_process = true;
|
||||
} else {
|
||||
error!(
|
||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
task_name, task.tenant_id, task.timeline_id, err
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -448,13 +439,13 @@ async fn task_finish(
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
task_name, task.tenant_id, task.timeline_id, err
|
||||
);
|
||||
shutdown_process = true;
|
||||
} else {
|
||||
error!(
|
||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
task_name, task.tenant_id, task.timeline_id, err
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -466,17 +457,6 @@ async fn task_finish(
|
||||
}
|
||||
}
|
||||
|
||||
// expected to be called from the task of the given id.
|
||||
pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
|
||||
CURRENT_TASK.with(|ct| {
|
||||
let mut task_mut = ct.mutable.lock().unwrap();
|
||||
task_mut.tenant_id = tenant_id;
|
||||
task_mut.timeline_id = timeline_id;
|
||||
});
|
||||
}
|
||||
|
||||
/// Is there a task running that matches the criteria
|
||||
|
||||
/// Signal and wait for tasks to shut down.
|
||||
///
|
||||
///
|
||||
@@ -499,17 +479,16 @@ pub async fn shutdown_tasks(
|
||||
{
|
||||
let tasks = TASKS.lock().unwrap();
|
||||
for task in tasks.values() {
|
||||
let task_mut = task.mutable.lock().unwrap();
|
||||
if (kind.is_none() || Some(task.kind) == kind)
|
||||
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
|
||||
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
|
||||
&& (tenant_id.is_none() || task.tenant_id == tenant_id)
|
||||
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
|
||||
{
|
||||
task.cancel.cancel();
|
||||
victim_tasks.push((
|
||||
Arc::clone(task),
|
||||
task.kind,
|
||||
task_mut.tenant_id,
|
||||
task_mut.timeline_id,
|
||||
task.tenant_id,
|
||||
task.timeline_id,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -327,7 +327,7 @@ mod tests {
|
||||
let mut sz: u16 = rng.gen();
|
||||
// Make 50% of the arrays small
|
||||
if rng.gen() {
|
||||
sz |= 63;
|
||||
sz &= 63;
|
||||
}
|
||||
random_array(sz.into())
|
||||
})
|
||||
|
||||
@@ -3,10 +3,10 @@ use std::sync::Arc;
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, instrument, warn, Instrument, Span};
|
||||
use tracing::{error, instrument, warn, Instrument, Span};
|
||||
|
||||
use utils::{
|
||||
backoff, completion, crashsafe, fs_ext,
|
||||
@@ -25,11 +25,9 @@ use super::{
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
|
||||
};
|
||||
|
||||
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum DeleteTenantError {
|
||||
#[error("GetTenant {0}")]
|
||||
@@ -69,7 +67,7 @@ fn remote_tenant_delete_mark_path(
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.context("tenant path")?;
|
||||
Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
|
||||
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
|
||||
}
|
||||
|
||||
async fn create_remote_delete_mark(
|
||||
@@ -159,7 +157,8 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
|
||||
// Assert timelines dir is empty.
|
||||
if !fs_ext::is_directory_empty(timelines_path).await? {
|
||||
// Display first 10 items in directory
|
||||
let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
|
||||
let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
|
||||
let list = &list.into_iter().take(10).collect::<Vec<_>>();
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"Timelines directory is not empty after all timelines deletion: {list:?}"
|
||||
)));
|
||||
@@ -248,32 +247,6 @@ async fn cleanup_remaining_fs_traces(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn remote_delete_mark_exists(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<bool> {
|
||||
// If remote storage is there we rely on it
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
|
||||
|
||||
let result = backoff::retry(
|
||||
|| async { remote_storage.download(&remote_mark_path).await },
|
||||
|e| matches!(e, DownloadError::NotFound),
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
"fetch_tenant_deletion_mark",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(_) => Ok(true),
|
||||
Err(DownloadError::NotFound) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
|
||||
}
|
||||
}
|
||||
|
||||
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
@@ -285,10 +258,9 @@ pub(crate) async fn remote_delete_mark_exists(
|
||||
/// 6. Remove remote mark
|
||||
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are three entrypoints to the process:
|
||||
/// There are two entrypoints to the process:
|
||||
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
|
||||
/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTenantFlow {
|
||||
@@ -387,7 +359,7 @@ impl DeleteTenantFlow {
|
||||
|
||||
pub(crate) async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
remote_mark_exists: bool,
|
||||
tenant: &Tenant,
|
||||
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
|
||||
let acquire = |t: &Tenant| {
|
||||
@@ -398,66 +370,25 @@ impl DeleteTenantFlow {
|
||||
)
|
||||
};
|
||||
|
||||
let tenant_id = tenant.tenant_id;
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
|
||||
if remote_mark_exists {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
let remote_storage = match remote_storage {
|
||||
Some(remote_storage) => remote_storage,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
|
||||
let tenant_id = tenant.tenant_id;
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
|
||||
Ok(acquire(tenant))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn resume_from_load(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
tenant
|
||||
.set_stopping(progress, true, false)
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
// Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
|
||||
let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
if let Some(background) = background_jobs_can_start {
|
||||
info!("waiting for backgound jobs barrier");
|
||||
background.clone().wait().await;
|
||||
info!("ready for backgound jobs barrier");
|
||||
}
|
||||
|
||||
// Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
|
||||
let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
|
||||
if timelines_path.exists() {
|
||||
tenant.load(init_order, None, ctx).await.context("load")?;
|
||||
}
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
tenant.conf,
|
||||
tenant.remote_storage.clone(),
|
||||
tenants,
|
||||
tenant,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn resume_from_attach(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
preload: Option<TenantPreload>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
@@ -468,7 +399,7 @@ impl DeleteTenantFlow {
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant
|
||||
.attach(ctx, super::AttachMarkerMode::Expect)
|
||||
.attach(init_order, preload, ctx)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
@@ -606,10 +537,18 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
.context("cleanup_remaining_fs_traces")?;
|
||||
|
||||
let mut locked = tenants.write().unwrap();
|
||||
if locked.remove(&tenant.tenant_id).is_none() {
|
||||
warn!("Tenant got removed from tenants map during deletion");
|
||||
};
|
||||
{
|
||||
let mut locked = tenants.write().unwrap();
|
||||
if locked.remove(&tenant.tenant_id).is_none() {
|
||||
warn!("Tenant got removed from tenants map during deletion");
|
||||
};
|
||||
|
||||
// FIXME: we should not be modifying this from outside of mgr.rs.
|
||||
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
|
||||
crate::metrics::TENANT_MANAGER
|
||||
.tenant_slots
|
||||
.set(locked.len() as u64);
|
||||
}
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -406,4 +406,123 @@ mod tests {
|
||||
METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_metadata_bincode_serde() {
|
||||
let original_metadata = TimelineMetadata::new(
|
||||
Lsn(0x200),
|
||||
Some(Lsn(0x100)),
|
||||
Some(TIMELINE_ID),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
// Any version will do here, so use the default
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
);
|
||||
let metadata_bytes = original_metadata
|
||||
.to_bytes()
|
||||
.expect("Cannot create bytes array from metadata");
|
||||
|
||||
let metadata_bincode_be_bytes = original_metadata
|
||||
.ser()
|
||||
.expect("Cannot serialize the metadata");
|
||||
|
||||
// 8 bytes for the length of the vector
|
||||
assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
|
||||
|
||||
let expected_bincode_bytes = {
|
||||
let mut temp = vec![];
|
||||
let len_bytes = metadata_bytes.len().to_be_bytes();
|
||||
temp.extend_from_slice(&len_bytes);
|
||||
temp.extend_from_slice(&metadata_bytes);
|
||||
temp
|
||||
};
|
||||
assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
|
||||
|
||||
let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
|
||||
// Deserialized metadata has the metadata header, which is different from the serialized one.
|
||||
// Reference: TimelineMetaData::to_bytes()
|
||||
let expected_metadata = {
|
||||
let mut temp_metadata = original_metadata;
|
||||
let body_bytes = temp_metadata
|
||||
.body
|
||||
.ser()
|
||||
.expect("Cannot serialize the metadata body");
|
||||
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
|
||||
let hdr = TimelineMetadataHeader {
|
||||
size: metadata_size as u16,
|
||||
format_version: METADATA_FORMAT_VERSION,
|
||||
checksum: crc32c::crc32c(&body_bytes),
|
||||
};
|
||||
temp_metadata.hdr = hdr;
|
||||
temp_metadata
|
||||
};
|
||||
assert_eq!(deserialized_metadata, expected_metadata);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_metadata_bincode_serde_ensure_roundtrip() {
|
||||
let original_metadata = TimelineMetadata::new(
|
||||
Lsn(0x200),
|
||||
Some(Lsn(0x100)),
|
||||
Some(TIMELINE_ID),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
// Any version will do here, so use the default
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
);
|
||||
let expected_bytes = vec![
|
||||
/* bincode length encoding bytes */
|
||||
0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
|
||||
/* TimelineMetadataHeader */
|
||||
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
/* TimelineMetadataBodyV2 */
|
||||
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
|
||||
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
|
||||
1, 17, 34, 51, 68, 85, 102, 119, 136, 17, 34, 51, 68, 85, 102, 119,
|
||||
136, // ancestor_timeline (17 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
|
||||
0, 0, 0, 15, // pg_version (4 bytes)
|
||||
/* padding bytes */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
];
|
||||
let metadata_ser_bytes = original_metadata.ser().unwrap();
|
||||
assert_eq!(metadata_ser_bytes, expected_bytes);
|
||||
|
||||
let expected_metadata = {
|
||||
let mut temp_metadata = original_metadata;
|
||||
let body_bytes = temp_metadata
|
||||
.body
|
||||
.ser()
|
||||
.expect("Cannot serialize the metadata body");
|
||||
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
|
||||
let hdr = TimelineMetadataHeader {
|
||||
size: metadata_size as u16,
|
||||
format_version: METADATA_FORMAT_VERSION,
|
||||
checksum: crc32c::crc32c(&body_bytes),
|
||||
};
|
||||
temp_metadata.hdr = hdr;
|
||||
temp_metadata
|
||||
};
|
||||
let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
|
||||
assert_eq!(des_metadata, expected_metadata);
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -168,41 +168,19 @@
|
||||
//! - create `Timeline` struct and a `RemoteTimelineClient`
|
||||
//! - initialize the client's upload queue with its `IndexPart`
|
||||
//! - schedule uploads for layers that are only present locally.
|
||||
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
|
||||
//! the local filesystem, write the remote metadata to the local filesystem
|
||||
//! - After the above is done for each timeline, open the tenant for business by
|
||||
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
|
||||
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
|
||||
//!
|
||||
//! We keep track of the fact that a client is in `Attaching` state in a marker
|
||||
//! file on the local disk. This is critical because, when we restart the pageserver,
|
||||
//! we do not want to do the `List timelines` step for each tenant that has already
|
||||
//! been successfully attached (for performance & cost reasons).
|
||||
//! Instead, for a tenant without the attach marker file, we assume that the
|
||||
//! local state is in sync or ahead of the remote state. This includes the list
|
||||
//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
|
||||
//! if there's a timeline on the remote that the pageserver doesn't know about,
|
||||
//! the GC will not consider its branch point, leading to data loss.
|
||||
//! So, for a tenant with the attach marker file, we know that we do not yet have
|
||||
//! persisted all the remote timeline's metadata files locally. To exclude the
|
||||
//! risk above, we re-run the procedure for such tenants
|
||||
//!
|
||||
//! # Operating Without Remote Storage
|
||||
//!
|
||||
//! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
|
||||
//! not created and the uploads are skipped.
|
||||
//! Theoretically, it should be ok to remove and re-add remote storage configuration to
|
||||
//! the pageserver config at any time, since it doesn't make a difference to
|
||||
//! [`Timeline::load_layer_map`].
|
||||
//! Of course, the remote timeline dir must not change while we have de-configured
|
||||
//! remote storage, i.e., the pageserver must remain the owner of the given prefix
|
||||
//! in remote storage.
|
||||
//! But note that we don't test any of this right now.
|
||||
//!
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
|
||||
|
||||
pub(crate) mod download;
|
||||
mod download;
|
||||
pub mod index;
|
||||
mod upload;
|
||||
|
||||
@@ -468,7 +446,10 @@ impl RemoteTimelineClient {
|
||||
//
|
||||
|
||||
/// Download index file
|
||||
pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
|
||||
pub async fn download_index_file(
|
||||
&self,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<MaybeDeletedIndexPart, DownloadError> {
|
||||
let _unfinished_gauge_guard = self.metrics.call_begin(
|
||||
&RemoteOpFileKind::Index,
|
||||
&RemoteOpKind::Download,
|
||||
@@ -482,6 +463,7 @@ impl RemoteTimelineClient {
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
self.generation,
|
||||
cancel,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
@@ -739,6 +721,17 @@ impl RemoteTimelineClient {
|
||||
})
|
||||
.collect();
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
for (name, gen) in &with_generations {
|
||||
if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
|
||||
if &unexpected == gen {
|
||||
tracing::error!("{name} was unlinked twice with same generation");
|
||||
} else {
|
||||
tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// after unlinking files from the upload_queue.latest_files we must always schedule an
|
||||
// index_part update, because that needs to be uploaded before we can actually delete the
|
||||
// files.
|
||||
@@ -772,6 +765,19 @@ impl RemoteTimelineClient {
|
||||
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
for (name, gen) in &with_generations {
|
||||
match upload_queue.dangling_files.remove(name) {
|
||||
Some(same) if &same == gen => { /* expected */ }
|
||||
Some(other) => {
|
||||
tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
|
||||
}
|
||||
None => {
|
||||
tracing::error!("{name} was unlinked but was not dangling");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// schedule the actual deletions
|
||||
let op = UploadOp::Delete(Delete {
|
||||
layers: with_generations,
|
||||
@@ -797,9 +803,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
let names = compacted_from.iter().map(|x| x.layer_desc().filename());
|
||||
|
||||
let with_generations =
|
||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
|
||||
self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
|
||||
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
|
||||
Ok(())
|
||||
@@ -1452,6 +1456,8 @@ impl RemoteTimelineClient {
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::default(),
|
||||
queued_operations: VecDeque::default(),
|
||||
#[cfg(feature = "testing")]
|
||||
dangling_files: HashMap::default(),
|
||||
};
|
||||
|
||||
let upload_queue = std::mem::replace(
|
||||
@@ -1495,23 +1501,6 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_layers_metadata(
|
||||
&self,
|
||||
layers: Vec<LayerFileName>,
|
||||
) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
|
||||
let q = self.upload_queue.lock().unwrap();
|
||||
let q = match &*q {
|
||||
UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
|
||||
anyhow::bail!("queue is in state {}", q.as_str())
|
||||
}
|
||||
UploadQueue::Initialized(inner) => inner,
|
||||
};
|
||||
|
||||
let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
|
||||
|
||||
Ok(decorated.collect())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
|
||||
@@ -1552,15 +1541,8 @@ pub fn remote_index_path(
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub const HEATMAP_BASENAME: &str = "heatmap";
|
||||
|
||||
pub fn remote_heatmap_path(tenant_id: &TenantId) -> RemotePath {
|
||||
RemotePath::from_string(&format!("tenants/{tenant_id}/{HEATMAP_BASENAME}-v01"))
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
/// Given the key of an index, parse out the generation part of the name
|
||||
pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
|
||||
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
|
||||
let file_name = match path.get_path().file_name() {
|
||||
Some(f) => f,
|
||||
None => {
|
||||
@@ -1749,7 +1731,11 @@ mod tests {
|
||||
let client = timeline.remote_client.as_ref().unwrap();
|
||||
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let initial_index_part = match client.download_index_file().await.unwrap() {
|
||||
let initial_index_part = match client
|
||||
.download_index_file(CancellationToken::new())
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
|
||||
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
|
||||
};
|
||||
@@ -1838,7 +1824,11 @@ mod tests {
|
||||
}
|
||||
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let index_part = match client.download_index_file().await.unwrap() {
|
||||
let index_part = match client
|
||||
.download_index_file(CancellationToken::new())
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
|
||||
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
|
||||
};
|
||||
@@ -2037,7 +2027,7 @@ mod tests {
|
||||
let client = test_state.build_client(get_generation);
|
||||
|
||||
let download_r = client
|
||||
.download_index_file()
|
||||
.download_index_file(CancellationToken::new())
|
||||
.await
|
||||
.expect("download should always succeed");
|
||||
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
|
||||
|
||||
@@ -18,8 +18,8 @@ use crate::config::PageServerConf;
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use crate::tenant::Generation;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
@@ -170,53 +170,43 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
|
||||
pub async fn list_remote_timelines(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<HashSet<TimelineId>> {
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
|
||||
let remote_path = remote_timelines_path(&tenant_id);
|
||||
|
||||
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
|
||||
anyhow::bail!("storage-sync-list-remote-timelines");
|
||||
});
|
||||
|
||||
let timelines = download_retry(
|
||||
|| storage.list_prefixes(Some(&remote_path)),
|
||||
&format!("list prefixes for {tenant_id}"),
|
||||
let listing = download_retry_forever(
|
||||
|| storage.list(Some(&remote_path), ListingMode::WithDelimiter),
|
||||
&format!("list timelines for {tenant_id}"),
|
||||
cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
anyhow::bail!("no timelines found on the remote storage")
|
||||
}
|
||||
|
||||
let mut timeline_ids = HashSet::new();
|
||||
let mut other_prefixes = HashSet::new();
|
||||
|
||||
for timeline_remote_storage_key in timelines {
|
||||
if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) {
|
||||
// A `deleted` key within `timelines/` is a marker file, not a timeline. Ignore it.
|
||||
// This code will be removed in https://github.com/neondatabase/neon/pull/5580
|
||||
continue;
|
||||
}
|
||||
|
||||
for timeline_remote_storage_key in listing.prefixes {
|
||||
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||
})?;
|
||||
|
||||
let timeline_id: TimelineId = object_name
|
||||
.parse()
|
||||
.with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
|
||||
|
||||
// list_prefixes is assumed to return unique names. Ensure this here.
|
||||
// NB: it's safer to bail out than warn-log this because the pageserver
|
||||
// needs to absolutely know about _all_ timelines that exist, so that
|
||||
// GC knows all the branchpoints. If we skipped over a timeline instead,
|
||||
// GC could delete a layer that's still needed by that timeline.
|
||||
anyhow::ensure!(
|
||||
!timeline_ids.contains(&timeline_id),
|
||||
"list_prefixes contains duplicate timeline id {timeline_id}"
|
||||
);
|
||||
timeline_ids.insert(timeline_id);
|
||||
match object_name.parse::<TimelineId>() {
|
||||
Ok(t) => timeline_ids.insert(t),
|
||||
Err(_) => other_prefixes.insert(object_name.to_string()),
|
||||
};
|
||||
}
|
||||
|
||||
Ok(timeline_ids)
|
||||
for key in listing.keys {
|
||||
let object_name = key
|
||||
.object_name()
|
||||
.ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?;
|
||||
other_prefixes.insert(object_name.to_string());
|
||||
}
|
||||
|
||||
Ok((timeline_ids, other_prefixes))
|
||||
}
|
||||
|
||||
async fn do_download_index_part(
|
||||
@@ -224,10 +214,11 @@ async fn do_download_index_part(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
index_generation: Generation,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
|
||||
|
||||
let index_part_bytes = download_retry(
|
||||
let index_part_bytes = download_retry_forever(
|
||||
|| async {
|
||||
let mut index_part_download = storage.download(&remote_path).await?;
|
||||
|
||||
@@ -242,6 +233,7 @@ async fn do_download_index_part(
|
||||
Ok(index_part_bytes)
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -263,19 +255,28 @@ pub(super) async fn download_index_part(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
my_generation: Generation,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
if my_generation.is_none() {
|
||||
// Operating without generations: just fetch the generation-less path
|
||||
return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
|
||||
return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
|
||||
.await;
|
||||
}
|
||||
|
||||
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
|
||||
// index in our generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
|
||||
let res = do_download_index_part(
|
||||
storage,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
my_generation,
|
||||
cancel.clone(),
|
||||
)
|
||||
.await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!(
|
||||
@@ -295,8 +296,14 @@ pub(super) async fn download_index_part(
|
||||
// we want to find the most recent index from a previous generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res =
|
||||
do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
|
||||
let res = do_download_index_part(
|
||||
storage,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
my_generation.previous(),
|
||||
cancel.clone(),
|
||||
)
|
||||
.await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!("Found index_part from previous generation");
|
||||
@@ -340,13 +347,14 @@ pub(super) async fn download_index_part(
|
||||
match max_previous_generation {
|
||||
Some(g) => {
|
||||
tracing::debug!("Found index_part in generation {g:?}");
|
||||
do_download_index_part(storage, tenant_id, timeline_id, g).await
|
||||
do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
|
||||
}
|
||||
None => {
|
||||
// Migration from legacy pre-generation state: we have a generation but no prior
|
||||
// attached pageservers did. Try to load from a no-generation path.
|
||||
tracing::info!("No index_part.json* found");
|
||||
do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
|
||||
do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -376,3 +384,23 @@ where
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn download_retry_forever<T, O, F>(
|
||||
op: O,
|
||||
description: &str,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<T, DownloadError>
|
||||
where
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, DownloadError>>,
|
||||
{
|
||||
backoff::retry(
|
||||
op,
|
||||
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
u32::MAX,
|
||||
description,
|
||||
backoff::Cancel::new(cancel, || DownloadError::Cancelled),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user