Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-05 03:30:36 +00:00

Compare commits (170 commits): hack/compu ... task_hiera
Commit SHA1s in this comparison:

d87549696b, 960a29a6fe, d6f6e9a87b, ddae6e2b0a, e021298dec, 9790a7c2e8, 9660282c69, 894cd3ddf7, 735c9b3b70, e76b24ccc5,
6ff2c07cc8, efd46e478a, c2528ae671, a919b863d1, 2d22661061, e3778381a8, c8316b7a3f, 8460654f61, 7c2c87a5ab, 5820faaa87,
dfb0a6fdaf, 6acbee2368, aec1acdbac, 8bb4a13192, 9e071e4458, fead836f26, 20e9cf7d31, 3b04f3a749, c49fd69bd6, 5ab9592a2d,
036558c956, 6a922b1a75, f1fc1fd639, 66a7a226f8, f0d15cee6f, 0ba4cae491, df1f8e13c4, e640bc7dba, cf024de202, e1a564ace2,
f5b9af6ac7, 5e98855d80, 699049b8f3, 2c544343e0, 193e60e2b8, 1bbd6cae24, 65f48c7002, d9d8e9afc7, 7914eaf1e6, 37fdbc3aaa,
7aa1e58301, f2892d3798, b492cedf51, 880663f6bc, e89e41f8ba, f9401fdd31, b7ffe24426, 52718bb8ff, 10c77cb410, 31be301ef3,
a3c7d400b4, 7501ca6efb, 987c9aaea0, 7fab731f65, 483caa22c6, da5e03b0d8, be885370f6, bc1020f965, 61fe9d360d, f60e49fe8e,
c48918d329, bad686bb71, 85d08581ed, c7f1143e57, 7403d55013, 12f02523a4, 207c527270, eae49ff598, e6b2f89fec, 1d81e70d60,
e3512340c1, e43cde7aba, c1295bfb3a, 711425cc47, fd81945a60, e49c21a3cd, 92e7cd40e8, 7eabfc40ee, ce1652990d, 8cd28e1718,
1c88824ed0, 1ce1c82d78, f784e59b12, b71b8ecfc2, 3842773546, f39fca0049, b451e75dc6, 3657a3c76e, eba3bfc57e, 57ae9cd07f,
3bb1030f5d, 5d3c3636fc, 0c87d1866b, 8ec6033ed8, e12e2681e9, 1e57ddaabc, 3e094e90d7, 292281c9df, 50d959fddc, fc77c42c57,
f05d1b598a, ca597206b8, 46f20faa0d, 9e55ad4796, 70b5646fba, 64890594a5, 78e73b20e1, c48cc020bd, a15969714c, 8c195d8214,
0d16874960, fd440e7d79, 65160650da, 12dd6b61df, 5345c1c21b, 105edc265c, 8625466144, 1ab0cfc8cb, ca469be1cf, 286f34dfce,
f290b27378, 4cd18fcebd, 4c29e0594e, 3c56a4dd18, 316309c85b, e09bb9974c, 5289f341ce, 683ec2417c, a76a503b8b, 92bc2bb132,
b80b9e1c4c, 87b8ac3ec3, 6b1c4cc983, 831fad46d5, 53851ea8ec, 044375732a, ea63b43009, a56fd45f56, 582a42762b, f5dfa6f140,
f8d9bd8d14, 04e6c09f14, 54327bbeec, 35f243e787, b7a988ba46, a0e61145c8, 6afbadc90e, 2a12e9c46b, 9e3c07611c, d353fa1998,
0d10992e46, 3e131bb3d7, 81b2cefe10, d2ca410919, d98ac04136, ac08072d2e, d22dce2e31, 3b3f040be3, cad0dca4b8, 5d13a2e426
@@ -1,17 +1,3 @@
-# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
-# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
-# optimizations enabled by "opt-level=1" don't affect debuggability too much.
-#
-# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
-#
-[profile.dev.package."*"]
-# Set the default for dependencies in Development mode.
-opt-level = 3
-
-[profile.dev]
-# Turn on a small amount of optimization in Development mode.
-opt-level = 1
-
 [build]
 # This is only present for local builds, as it will be overridden
 # by the RUSTDOCFLAGS env var in CI.
.github/workflows/build_and_test.yml (11 changed lines)
@@ -199,6 +199,10 @@ jobs:
 #
 git config --global --add safe.directory ${{ github.workspace }}
 git config --global --add safe.directory ${GITHUB_WORKSPACE}
+for r in 14 15 16; do
+git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+done
 
 - name: Checkout
 uses: actions/checkout@v3
@@ -404,7 +408,7 @@ jobs:
 uses: ./.github/actions/save-coverage-data
 
 regress-tests:
-needs: [ check-permissions, build-neon ]
+needs: [ check-permissions, build-neon, tag ]
 runs-on: [ self-hosted, gen3, large ]
 container:
 image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -436,6 +440,7 @@ jobs:
 env:
 TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
 CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
+BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
 
 - name: Merge and upload coverage data
 if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
@@ -1096,6 +1101,10 @@ jobs:
 #
 git config --global --add safe.directory ${{ github.workspace }}
 git config --global --add safe.directory ${GITHUB_WORKSPACE}
+for r in 14 15 16; do
+git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+done
 
 - name: Checkout
 uses: actions/checkout@v3
.github/workflows/neon_extra_builds.yml (18 changed lines)
@@ -142,6 +142,10 @@ jobs:
 #
 git config --global --add safe.directory ${{ github.workspace }}
 git config --global --add safe.directory ${GITHUB_WORKSPACE}
+for r in 14 15 16; do
+git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+done
 
 - name: Checkout
 uses: actions/checkout@v4
@@ -238,6 +242,20 @@
 options: --init
 
 steps:
+- name: Fix git ownership
+run: |
+# Workaround for `fatal: detected dubious ownership in repository at ...`
+#
+# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+# Ref https://github.com/actions/checkout/issues/785
+#
+git config --global --add safe.directory ${{ github.workspace }}
+git config --global --add safe.directory ${GITHUB_WORKSPACE}
+for r in 14 15 16; do
+git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+done
+
 - name: Checkout
 uses: actions/checkout@v4
 with:
.github/workflows/release.yml (2 changed lines)
@@ -2,7 +2,7 @@ name: Create Release Branch
 
 on:
 schedule:
-- cron: '0 7 * * 5'
+- cron: '0 6 * * 1'
 workflow_dispatch:
 
 jobs:
.gitignore (3 changed lines)
@@ -18,3 +18,6 @@ test_output/
 *.o
 *.so
 *.Po
+
+# pgindent typedef lists
+*.list
@@ -9,6 +9,24 @@ refactoring, additional comments, and so forth. Let's try to raise the
 bar, and clean things up as we go. Try to leave code in a better shape
 than it was before.
 
+## Pre-commit hook
+
+We have a sample pre-commit hook in `pre-commit.py`.
+To set it up, run:
+
+```bash
+ln -s ../../pre-commit.py .git/hooks/pre-commit
+```
+
+This will run following checks on staged files before each commit:
+- `rustfmt`
+- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
+
+There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
+and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
+
+If you want to skip the hook, run `git commit` with `--no-verify` option.
+
 ## Submitting changes
 
 1. Get at least one +1 on your PR before you push.
Cargo.lock (generated, 697 changed lines): file diff suppressed because it is too large.
Cargo.toml (41 changed lines)
@@ -37,20 +37,19 @@ license = "Apache-2.0"
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
-async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
+async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
-azure_core = "0.16"
-azure_identity = "0.16"
-azure_storage = "0.16"
-azure_storage_blobs = "0.16"
+azure_core = "0.18"
+azure_identity = "0.18"
+azure_storage = "0.18"
+azure_storage_blobs = "0.18"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "0.56", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.29"
-aws-smithy-http = "0.56"
-aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
-aws-credential-types = "0.56"
-aws-types = "0.56"
+aws-config = { version = "1.0", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.0"
+aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
+aws-smithy-types = "1.0"
+aws-credential-types = "1.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -89,6 +88,7 @@ humantime-serde = "1.1.1"
 hyper = "0.14"
 hyper-tungstenite = "0.11"
 inotify = "0.10.2"
+ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
@@ -109,7 +109,7 @@ pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-regex = "1.4"
+regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
 reqwest-middleware = "0.2.0"
@@ -122,14 +122,17 @@ rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
+sd-notify = "0.4.1"
 sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
+serde_path_to_error = "0.1"
 serde_with = "2.0"
 serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
+smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
@@ -146,7 +149,7 @@ tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
-tokio-util = { version = "0.7", features = ["io"] }
+tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
@@ -165,11 +168,11 @@ env_logger = "0.10"
 log = "0.4"
 
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 
 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
|||||||
|
|
||||||
# This is only needed for proxy's tests.
|
# This is only needed for proxy's tests.
|
||||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||||
|
|
||||||
################# Binary contents sections
|
################# Binary contents sections
|
||||||
|
|
||||||
|
|||||||
@@ -393,7 +393,9 @@ RUN case "${PG_VERSION}" in \
 export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
 ;; \
 *) \
-echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
+export TIMESCALEDB_VERSION=2.13.0 \
+export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
+;; \
 esac && \
 apt-get update && \
 apt-get install -y cmake && \
@@ -714,6 +716,23 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -
 cargo pgrx install --release && \
 echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
 
+#########################################################################################
+#
+# Layer "wal2json-build"
+# Compile "wal2json" extension
+#
+#########################################################################################
+
+FROM build-deps AS wal2json-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
+echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
+mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
+make -j $(getconf _NPROCESSORS_ONLN) && \
+make -j $(getconf _NPROCESSORS_ONLN) install
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -750,6 +769,7 @@ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY pgxn/ pgxn/
 
 RUN make -j $(getconf _NPROCESSORS_ONLN) \
Makefile (38 changed lines)
@@ -260,6 +260,44 @@ distclean:
 fmt:
 ./pre-commit.py --fix-inplace
 
+postgres-%-pg-bsd-indent: postgres-%
++@echo "Compiling pg_bsd_indent"
+$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
+
+# Create typedef list for the core. Note that generally it should be combined with
+# buildfarm one to cover platform specific stuff.
+# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
+postgres-%-typedefs.list: postgres-%
+$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
+
+# Indent postgres. See src/tools/pgindent/README for details.
+.PHONY: postgres-%-pgindent
+postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
++@echo merge with buildfarm typedef to cover all platforms
++@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
+REL_16_STABLE list misses PGSemaphoreData
+# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
+# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
+cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
++@echo note: you might want to run it on selected files/dirs instead.
+INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
+$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
+$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
+--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
+rm -f pg*.BAK
+
+# Indent pxgn/neon.
+.PHONY: pgindent
+neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
+$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
+INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
+PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
+-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
+-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
+
+
 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
@@ -149,6 +149,9 @@ tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
 Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
 Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
 
+# create postgres compute node
+> cargo neon endpoint create main
+
 # start postgres compute node
 > cargo neon endpoint start main
 Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
@@ -185,8 +188,11 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
 (L) main [de200bd42b49cc1814412c7e592dd6e9]
 (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
 
+# create postgres on that branch
+> cargo neon endpoint create migration_check --branch-name migration_check
+
 # start postgres on that branch
-> cargo neon endpoint start migration_check --branch-name migration_check
+> cargo neon endpoint start migration_check
 Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
 Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
 
@@ -38,3 +38,4 @@ toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.12.4"
+bytes = "1.0"
@@ -31,7 +31,7 @@
 //! -C 'postgresql://cloud_admin@localhost/postgres' \
 //! -S /var/db/postgres/specs/current.json \
 //! -b /usr/local/bin/postgres \
-//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
+//! -r http://pg-ext-s3-gateway
 //! ```
 //!
 use std::collections::HashMap;
@@ -51,7 +51,7 @@ use compute_api::responses::ComputeStatus;
 
 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
-use compute_tools::extension_server::{get_pg_version, init_remote_storage};
+use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
@@ -60,7 +60,7 @@ use compute_tools::spec::*;
 
 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "5670669815";
+const BUILD_TAG_DEFAULT: &str = "latest";
 
 fn main() -> Result<()> {
 init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -74,10 +74,18 @@ fn main() -> Result<()> {
 let pgbin_default = String::from("postgres");
 let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
 
-let remote_ext_config = matches.get_one::<String>("remote-ext-config");
-let ext_remote_storage = remote_ext_config.map(|x| {
-init_remote_storage(x).expect("cannot initialize remote extension storage from config")
-});
+let ext_remote_storage = matches
+.get_one::<String>("remote-ext-config")
+// Compatibility hack: if the control plane specified any remote-ext-config
+// use the default value for extension storage proxy gateway.
+// Remove this once the control plane is updated to pass the gateway URL
+.map(|conf| {
+if conf.starts_with("http") {
+conf.trim_end_matches('/')
+} else {
+"http://pg-ext-s3-gateway"
+}
+});
 
 let http_port = *matches
 .get_one::<u16>("http-port")
@@ -198,7 +206,7 @@ fn main() -> Result<()> {
 live_config_allowed,
 state: Mutex::new(new_state),
 state_changed: Condvar::new(),
-ext_remote_storage,
+ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
 ext_download_progress: RwLock::new(HashMap::new()),
 build_tag,
 };
@@ -266,7 +274,13 @@ fn main() -> Result<()> {
 let mut state = compute.state.lock().unwrap();
 state.error = Some(format!("{:?}", err));
 state.status = ComputeStatus::Failed;
-drop(state);
+// Notify others that Postgres failed to start. In case of configuring the
+// empty compute, it's likely that API handler is still waiting for compute
+// state change. With this we will notify it that compute is in Failed state,
+// so control plane will know about it earlier and record proper error instead
+// of timeout.
+compute.state_changed.notify_all();
+drop(state); // unlock
 delay_exit = true;
 None
 }
@@ -479,13 +493,6 @@ fn cli() -> clap::Command {
 )
 .value_name("FILECACHE_CONNSTR"),
 )
-.arg(
-// DEPRECATED, NO LONGER DOES ANYTHING.
-// See https://github.com/neondatabase/cloud/issues/7516
-Arg::new("file-cache-on-disk")
-.long("file-cache-on-disk")
-.action(clap::ArgAction::SetTrue),
-)
 }
 
 #[test]
@@ -22,10 +22,10 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 
 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;
 
-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, RemotePath};
 
 use crate::checker::create_availability_check_data;
 use crate::pg_helpers::*;
@@ -59,8 +59,8 @@ pub struct ComputeNode {
 pub state: Mutex<ComputeState>,
 /// `Condvar` to allow notifying waiters about state changes.
 pub state_changed: Condvar,
-/// the S3 bucket that we search for extensions in
-pub ext_remote_storage: Option<GenericRemoteStorage>,
+/// the address of extension storage proxy gateway
+pub ext_remote_storage: Option<String>,
 // key: ext_archive_name, value: started download time, download_completed?
 pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
 pub build_tag: String,
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 IF NOT EXISTS (
 SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
 THEN
-CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
+CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
 IF array_length(roles, 1) IS NOT NULL THEN
 EXECUTE format('GRANT neon_superuser TO %s',
 array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
@@ -277,6 +277,17 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 }
 
 impl ComputeNode {
+/// Check that compute node has corresponding feature enabled.
+pub fn has_feature(&self, feature: ComputeFeature) -> bool {
+let state = self.state.lock().unwrap();
+
+if let Some(s) = state.pspec.as_ref() {
+s.spec.features.contains(&feature)
+} else {
+false
+}
+}
+
 pub fn set_status(&self, status: ComputeStatus) {
 let mut state = self.state.lock().unwrap();
 state.status = status;
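The `has_feature` helper added in the hunk above gates optional compute behavior on `ComputeFeature` values carried in the spec, checked under the state lock. Below is a minimal, self-contained sketch of the same pattern, assuming simplified types: the `ComputeFeature` variant name, the `Spec`/`Node` structs, and the state shape are illustrative stand-ins rather than the real `compute_api` types.

```rust
use std::sync::Mutex;

// Illustrative stand-ins for the compute_api types; the variant name is hypothetical.
#[derive(PartialEq)]
enum ComputeFeature {
    Migrations,
}

struct Spec {
    features: Vec<ComputeFeature>,
}

struct Node {
    // In compute_ctl the parsed spec lives inside ComputeState behind a Mutex;
    // this sketch keeps only the part needed for the feature check.
    state: Mutex<Option<Spec>>,
}

impl Node {
    // Mirrors the diff: take the lock, and report false when no spec is attached yet.
    fn has_feature(&self, feature: ComputeFeature) -> bool {
        let state = self.state.lock().unwrap();
        match state.as_ref() {
            Some(spec) => spec.features.contains(&feature),
            None => false,
        }
    }
}

fn main() {
    let node = Node {
        state: Mutex::new(Some(Spec { features: vec![ComputeFeature::Migrations] })),
    };
    // Optional behavior runs only when the control plane listed the feature in the spec.
    if node.has_feature(ComputeFeature::Migrations) {
        println!("feature enabled by spec");
    }
}
```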
@@ -698,6 +709,7 @@ impl ComputeNode {
 handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
 handle_grants(spec, &mut client, self.connstr.as_str())?;
 handle_extensions(spec, &mut client)?;
+handle_extension_neon(&mut client)?;
 create_availability_check_data(&mut client)?;
 
 // 'Close' connection
@@ -727,7 +739,12 @@ impl ComputeNode {
 
 // Write new config
 let pgdata_path = Path::new(&self.pgdata);
-config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+let postgresql_conf_path = pgdata_path.join("postgresql.conf");
+config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
+// temporarily reset max_cluster_size in config
+// to avoid the possibility of hitting the limit, while we are reconfiguring:
+// creating new extensions, roles, etc...
+config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
 self.pg_reload_conf()?;
 
 let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
@@ -742,11 +759,16 @@ impl ComputeNode {
 handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
 handle_grants(&spec, &mut client, self.connstr.as_str())?;
 handle_extensions(&spec, &mut client)?;
+handle_extension_neon(&mut client)?;
 }
 
 // 'Close' connection
 drop(client);
 
+// reset max_cluster_size in config back to original value and reload config
+config::compute_ctl_temp_override_remove(pgdata_path)?;
+self.pg_reload_conf()?;
+
 let unknown_op = "unknown".to_string();
 let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
 info!(
@@ -807,7 +829,17 @@ impl ComputeNode {
 
 let config_time = Utc::now();
 if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
+let pgdata_path = Path::new(&self.pgdata);
+// temporarily reset max_cluster_size in config
+// to avoid the possibility of hitting the limit, while we are applying config:
+// creating new extensions, roles, etc...
+config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+self.pg_reload_conf()?;
+
 self.apply_config(&compute_state)?;
+
+config::compute_ctl_temp_override_remove(pgdata_path)?;
+self.pg_reload_conf()?;
 }
 
 let startup_end_time = Utc::now();
@@ -955,12 +987,12 @@ LIMIT 100",
 real_ext_name: String,
 ext_path: RemotePath,
 ) -> Result<u64, DownloadError> {
-let remote_storage = self
-.ext_remote_storage
+let ext_remote_storage =
+self.ext_remote_storage
 .as_ref()
 .ok_or(DownloadError::BadInput(anyhow::anyhow!(
 "Remote extensions storage is not configured",
 )))?;
 
 let ext_archive_name = ext_path.object_name().expect("bad path");
 
@@ -1016,7 +1048,7 @@ LIMIT 100",
 let download_size = extension_server::download_extension(
 &real_ext_name,
 &ext_path,
-remote_storage,
+ext_remote_storage,
 &self.pgbin,
 )
 .await
@@ -93,5 +93,25 @@ pub fn write_postgres_conf(
 writeln!(file, "neon.extension_server_port={}", port)?;
 }
 
+// This is essential to keep this line at the end of the file,
+// because it is intended to override any settings above.
+writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
+
+Ok(())
+}
+
+/// create file compute_ctl_temp_override.conf in pgdata_dir
+/// add provided options to this file
+pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
+let path = pgdata_path.join("compute_ctl_temp_override.conf");
+let mut file = File::create(path)?;
+write!(file, "{}", options)?;
+Ok(())
+}
+
+/// remove file compute_ctl_temp_override.conf in pgdata_dir
+pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
+let path = pgdata_path.join("compute_ctl_temp_override.conf");
+std::fs::remove_file(path)?;
 Ok(())
 }
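Taken together with the compute.rs hunks earlier, the functions above implement a temporary-override mechanism: `write_postgres_conf` ends `postgresql.conf` with an `include_if_exists` directive, and `compute_ctl_temp_override_create` / `compute_ctl_temp_override_remove` write and delete the included file around reconfiguration. The following standalone sketch only illustrates the resulting file layout; the directory path and the `neon.max_cluster_size=10240` value are made up for the example, while the file names and the `-1` override come from the diff.

```rust
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;

fn main() -> std::io::Result<()> {
    // Illustrative pgdata location; compute_ctl operates on the real data directory.
    let pgdata = Path::new("/tmp/pgdata-sketch");
    fs::create_dir_all(pgdata)?;

    // Tail of the generated postgresql.conf: the include must stay last so the
    // override file wins over anything written above it.
    let mut conf = File::create(pgdata.join("postgresql.conf"))?;
    writeln!(conf, "neon.max_cluster_size=10240")?; // illustrative value
    writeln!(conf, "include_if_exists = 'compute_ctl_temp_override.conf'")?;

    // Before applying the spec, the limit is lifted via the override file...
    let override_path = pgdata.join("compute_ctl_temp_override.conf");
    let mut file = File::create(&override_path)?;
    write!(file, "neon.max_cluster_size=-1")?;
    // (compute_ctl reloads the Postgres config here and applies the changes)

    // ...and afterwards the override is removed and the config reloaded again;
    // a missing include_if_exists target is simply ignored by Postgres.
    fs::remove_file(&override_path)?;
    Ok(())
}
```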
@@ -71,18 +71,16 @@ More specifically, here is an example ext_index.json
 }
 }
 */
-use anyhow::Context;
 use anyhow::{self, Result};
+use anyhow::{bail, Context};
+use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
 use regex::Regex;
 use remote_storage::*;
-use serde_json;
-use std::io::Read;
-use std::num::NonZeroUsize;
+use reqwest::StatusCode;
 use std::path::Path;
 use std::str;
 use tar::Archive;
-use tokio::io::AsyncReadExt;
 use tracing::info;
 use tracing::log::warn;
 use zstd::stream::read::Decoder;
@@ -138,23 +136,31 @@ fn parse_pg_version(human_version: &str) -> &str {
 pub async fn download_extension(
 ext_name: &str,
 ext_path: &RemotePath,
-remote_storage: &GenericRemoteStorage,
+ext_remote_storage: &str,
 pgbin: &str,
 ) -> Result<u64> {
 info!("Download extension {:?} from {:?}", ext_name, ext_path);
-let mut download = remote_storage.download(ext_path).await?;
-let mut download_buffer = Vec::new();
-download
-.download_stream
-.read_to_end(&mut download_buffer)
-.await?;
+
+// TODO add retry logic
+let download_buffer =
+match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
+Ok(buffer) => buffer,
+Err(error_message) => {
+return Err(anyhow::anyhow!(
+"error downloading extension {:?}: {:?}",
+ext_name,
+error_message
+));
+}
+};
+
 let download_size = download_buffer.len() as u64;
+info!("Download size {:?}", download_size);
 // it's unclear whether it is more performant to decompress into memory or not
 // TODO: decompressing into memory can be avoided
-let mut decoder = Decoder::new(download_buffer.as_slice())?;
-let mut decompress_buffer = Vec::new();
-decoder.read_to_end(&mut decompress_buffer)?;
-let mut archive = Archive::new(decompress_buffer.as_slice());
+let decoder = Decoder::new(download_buffer.as_ref())?;
+let mut archive = Archive::new(decoder);
 let unzip_dest = pgbin
 .strip_suffix("/bin/postgres")
 .expect("bad pgbin")
@@ -222,29 +228,32 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
 }
 }
 
-// This function initializes the necessary structs to use remote storage
-pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
-#[derive(Debug, serde::Deserialize)]
-struct RemoteExtJson {
-bucket: String,
-region: String,
-endpoint: Option<String>,
-prefix: Option<String>,
-}
-let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
+// Do request to extension storage proxy, i.e.
+// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
+// using HHTP GET
+// and return the response body as bytes
+//
+async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
+let uri = format!("{}/{}", ext_remote_storage, ext_path);
 
-let config = S3Config {
-bucket_name: remote_ext_json.bucket,
-bucket_region: remote_ext_json.region,
-prefix_in_bucket: remote_ext_json.prefix,
-endpoint: remote_ext_json.endpoint,
-concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
-max_keys_per_list_response: None,
-};
-let config = RemoteStorageConfig {
-storage: RemoteStorageKind::AwsS3(config),
-};
-GenericRemoteStorage::from_config(&config)
+info!("Download extension {:?} from uri {:?}", ext_path, uri);
+
+let resp = reqwest::get(uri).await?;
+
+match resp.status() {
+StatusCode::OK => match resp.bytes().await {
+Ok(resp) => {
+info!("Download extension {:?} completed successfully", ext_path);
+Ok(resp)
+}
+Err(e) => bail!("could not deserialize remote extension response: {}", e),
+},
+StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
+_ => bail!(
+"unexpected remote extension response status code: {}",
+resp.status()
+),
+}
 }
 
 #[cfg(test)]
@@ -123,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
 }
 }
 
-// download extension files from S3 on demand
+// download extension files from remote extension storage on demand
 (&Method::POST, route) if route.starts_with("/extension_server/") => {
 info!("serving {:?} POST request", route);
 info!("req.uri {:?}", req.uri());
@@ -227,7 +227,7 @@ async fn handle_configure_request(
 
 let parsed_spec = match ParsedSpec::try_from(spec) {
 Ok(ps) => ps,
-Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
+Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
 };
 
 // XXX: wrap state update under lock in code blocks. Otherwise,
@@ -156,17 +156,17 @@ paths:
 description: Error text or 'OK' if download succeeded.
 example: "OK"
 400:
 description: Request is invalid.
 content:
 application/json:
 schema:
 $ref: "#/components/schemas/GenericError"
 500:
 description: Extension download request failed.
 content:
 application/json:
 schema:
 $ref: "#/components/schemas/GenericError"
 
 components:
 securitySchemes:
@@ -193,16 +193,11 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
 let postgres_roles = xact
-.query(
-"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
-&[],
-)?
+.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
 .iter()
 .map(|row| Role {
 name: row.get("rolname"),
 encrypted_password: row.get("rolpassword"),
-replication: Some(row.get("rolreplication")),
-bypassrls: Some(row.get("rolbypassrls")),
 options: None,
 })
 .collect();
@@ -118,19 +118,6 @@ pub fn get_spec_from_control_plane(
 spec
 }
 
-/// It takes cluster specification and does the following:
-/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
-/// - Update `pg_hba.conf` to allow external connections.
-pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
-// File `postgresql.conf` is no longer included into `basebackup`, so just
-// always write all config into it creating new file.
-config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
-
-update_pg_hba(pgdata_path)?;
-
-Ok(())
-}
-
 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
 // XXX: consider making it a part of spec.json
@@ -265,8 +252,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 let action = if let Some(r) = pg_role {
 if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
 || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
-|| !r.bypassrls.unwrap_or(false)
-|| !r.replication.unwrap_or(false)
 {
 RoleAction::Update
 } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -298,14 +283,22 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 match action {
 RoleAction::None => {}
 RoleAction::Update => {
-let mut query: String =
-format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
+// This can be run on /every/ role! Not just ones created through the console.
+// This means that if you add some funny ALTER here that adds a permission,
+// this will get run even on user-created roles! This will result in different
+// behavior before and after a spec gets reapplied. The below ALTER as it stands
+// now only grants LOGIN and changes the password. Please do not allow this branch
+// to do anything silly.
+let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
 query.push_str(&role.to_pg_options());
 xact.execute(query.as_str(), &[])?;
 }
 RoleAction::Create => {
+// This branch only runs when roles are created through the console, so it is
+// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
+// from neon_superuser.
 let mut query: String = format!(
-"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+"CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
 name.pg_quote()
 );
 info!("role create query: '{}'", &query);
@@ -674,3 +667,33 @@ pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 
 Ok(())
 }
+
+/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
+#[instrument(skip_all)]
+pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
+info!("handle extension neon");
+
+let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
+client.simple_query(query)?;
+
+query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
+info!("create neon extension with query: {}", query);
+client.simple_query(query)?;
+
+query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
+client.simple_query(query)?;
+
+query = "ALTER EXTENSION neon SET SCHEMA neon";
+info!("alter neon extension schema with query: {}", query);
+client.simple_query(query)?;
+
+// this will be a no-op if extension is already up to date,
+// which may happen in two cases:
+// - extension was just installed
+// - extension was already installed and is up to date
+let query = "ALTER EXTENSION neon UPDATE";
+info!("update neon extension schema with query: {}", query);
+client.simple_query(query)?;
+
+Ok(())
+}
@@ -9,6 +9,7 @@ use clap::Parser;
 use hex::FromHex;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response};
+use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
@@ -173,7 +174,8 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
 if state.pageserver == Some(reattach_req.node_id) {
 state.generation += 1;
 response.tenants.push(ReAttachResponseTenant {
-id: *t,
+// TODO(sharding): make this shard-aware
+id: TenantShardId::unsharded(*t),
 gen: state.generation,
 });
 }
@@ -196,8 +198,15 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
 };
 
 for req_tenant in validate_req.tenants {
-if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
+// TODO(sharding): make this shard-aware
+if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
 let valid = tenant_state.generation == req_tenant.gen;
+tracing::info!(
+"handle_validate: {}(gen {}): valid={valid} (latest {})",
+req_tenant.id,
+req_tenant.gen,
+tenant_state.generation
+);
 response.tenants.push(ValidateResponseTenant {
 id: req_tenant.id,
 valid,
@@ -247,6 +256,13 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
|||||||
tenant_state.pageserver = attach_req.node_id;
|
tenant_state.pageserver = attach_req.node_id;
|
||||||
let generation = tenant_state.generation;
|
let generation = tenant_state.generation;
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
"handle_attach_hook: tenant {} set generation {}, pageserver {}",
|
||||||
|
attach_req.tenant_id,
|
||||||
|
tenant_state.generation,
|
||||||
|
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
|
||||||
|
);
|
||||||
|
|
||||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
json_response(
|
json_response(
|
||||||
@@ -286,6 +302,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
logging::init(
|
logging::init(
|
||||||
LogFormat::Plain,
|
LogFormat::Plain,
|
||||||
logging::TracingErrorLayerEnablement::Disabled,
|
logging::TracingErrorLayerEnablement::Disabled,
|
||||||
|
logging::Output::Stdout,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let args = Cli::parse();
|
let args = Cli::parse();
|
||||||
|
|||||||
@@ -168,7 +168,7 @@ fn print_timelines_tree(
             info: t.clone(),
             children: BTreeSet::new(),
             name: timeline_name_mappings
-                .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
+                .remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
         },
     )
 })
@@ -415,6 +415,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                 None,
                 None,
                 Some(pg_version),
+                None,
             )?;
             let new_timeline_id = timeline_info.timeline_id;
             let last_record_lsn = timeline_info.last_record_lsn;
@@ -487,8 +488,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                 .copied()
                 .context("Failed to parse postgres version from the argument string")?;
 
-            let timeline_info =
-                pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
+            let new_timeline_id_opt = parse_timeline_id(create_match)?;
+
+            let timeline_info = pageserver.timeline_create(
+                tenant_id,
+                new_timeline_id_opt,
+                None,
+                None,
+                Some(pg_version),
+                None,
+            )?;
             let new_timeline_id = timeline_info.timeline_id;
 
             let last_record_lsn = timeline_info.last_record_lsn;
@@ -575,6 +584,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                 start_lsn,
                 Some(ancestor_timeline_id),
                 None,
+                None,
             )?;
             let new_timeline_id = timeline_info.timeline_id;
 
@@ -601,11 +611,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
     };
     let mut cplane = ComputeControlPlane::load(env.clone())?;
 
-    // All subcommands take an optional --tenant-id option
-    let tenant_id = get_tenant_id(sub_args, env)?;
-
     match sub_name {
         "list" => {
+            let tenant_id = get_tenant_id(sub_args, env)?;
            let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
                eprintln!("Failed to load timeline info: {}", e);
                HashMap::new()
@@ -665,6 +673,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            println!("{table}");
        }
        "create" => {
+            let tenant_id = get_tenant_id(sub_args, env)?;
            let branch_name = sub_args
                .get_one::<String>("branch-name")
                .map(|s| s.as_str())
@@ -709,6 +718,18 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
            };
 
+            match (mode, hot_standby) {
+                (ComputeMode::Static(_), true) => {
+                    bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
+                }
+                (ComputeMode::Primary, true) => {
+                    bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                }
+                _ => {}
+            }
+
+            cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
+
            cplane.new_endpoint(
                &endpoint_id,
                tenant_id,
@@ -721,8 +742,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            )?;
        }
        "start" => {
-            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
@@ -751,80 +770,28 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                env.safekeepers.iter().map(|sk| sk.id).collect()
            };
 
-            let endpoint = cplane.endpoints.get(endpoint_id.as_str());
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
+
+            cplane.check_conflicting_endpoints(
+                endpoint.mode,
+                endpoint.tenant_id,
+                endpoint.timeline_id,
+            )?;
 
            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
-                let claims = Claims::new(Some(tenant_id), Scope::Tenant);
+                let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
 
                Some(env.generate_auth_token(&claims)?)
            } else {
                None
            };
 
-            let hot_standby = sub_args
-                .get_one::<bool>("hot-standby")
-                .copied()
-                .unwrap_or(false);
-
-            if let Some(endpoint) = endpoint {
-                match (&endpoint.mode, hot_standby) {
-                    (ComputeMode::Static(_), true) => {
-                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
-                    }
-                    (ComputeMode::Primary, true) => {
-                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
-                    }
-                    _ => {}
-                }
-                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
-            } else {
-                let branch_name = sub_args
-                    .get_one::<String>("branch-name")
-                    .map(|s| s.as_str())
-                    .unwrap_or(DEFAULT_BRANCH_NAME);
-                let timeline_id = env
-                    .get_branch_timeline_id(branch_name, tenant_id)
-                    .ok_or_else(|| {
-                        anyhow!("Found no timeline id for branch name '{branch_name}'")
-                    })?;
-                let lsn = sub_args
-                    .get_one::<String>("lsn")
-                    .map(|lsn_str| Lsn::from_str(lsn_str))
-                    .transpose()
-                    .context("Failed to parse Lsn from the request")?;
-                let pg_version = sub_args
-                    .get_one::<u32>("pg-version")
-                    .copied()
-                    .context("Failed to `pg-version` from the argument string")?;
-
-                let mode = match (lsn, hot_standby) {
-                    (Some(lsn), false) => ComputeMode::Static(lsn),
-                    (None, true) => ComputeMode::Replica,
-                    (None, false) => ComputeMode::Primary,
-                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
-                };
-
-                // when used with custom port this results in non obvious behaviour
-                // port is remembered from first start command, i e
-                // start --port X
-                // stop
-                // start <-- will also use port X even without explicit port argument
-                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
-
-                let ep = cplane.new_endpoint(
-                    endpoint_id,
-                    tenant_id,
-                    timeline_id,
-                    pg_port,
-                    http_port,
-                    pg_version,
-                    mode,
-                    pageserver_id,
-                )?;
-                ep.start(&auth_token, safekeepers, remote_ext_config)?;
-            }
+            println!("Starting existing endpoint {endpoint_id}...");
+            endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
        }
        "reconfigure" => {
            let endpoint_id = sub_args
@@ -1245,7 +1212,7 @@ fn cli() -> Command {
    let remote_ext_config_args = Arg::new("remote-ext-config")
        .long("remote-ext-config")
        .num_args(1)
-        .help("Configure the S3 bucket that we search for extensions in.")
+        .help("Configure the remote extensions storage proxy gateway to request for extensions.")
        .required(false);
 
    let lsn_arg = Arg::new("lsn")
@@ -1308,6 +1275,7 @@ fn cli() -> Command {
            .subcommand(Command::new("create")
                .about("Create a new blank timeline")
                .arg(tenant_id_arg.clone())
+                .arg(timeline_id_arg.clone())
                .arg(branch_name_arg.clone())
                .arg(pg_version_arg.clone())
            )
@@ -1429,15 +1397,7 @@ fn cli() -> Command {
            .subcommand(Command::new("start")
                .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                .arg(endpoint_id_arg.clone())
-                .arg(tenant_id_arg.clone())
-                .arg(branch_name_arg.clone())
-                .arg(timeline_id_arg.clone())
-                .arg(lsn_arg)
-                .arg(pg_port_arg)
-                .arg(http_port_arg)
                .arg(endpoint_pageserver_id_arg.clone())
-                .arg(pg_version_arg)
-                .arg(hot_standby_arg)
                .arg(safekeepers_arg)
                .arg(remote_ext_config_args)
            )
@@ -1450,7 +1410,6 @@ fn cli() -> Command {
            .subcommand(
                Command::new("stop")
                .arg(endpoint_id_arg)
-                .arg(tenant_id_arg.clone())
                .arg(
                    Arg::new("destroy")
                        .help("Also delete data directory (now optional, should be default in future)")
@@ -45,6 +45,7 @@ use std::sync::Arc;
 use std::time::Duration;
 
 use anyhow::{anyhow, bail, Context, Result};
+use compute_api::spec::RemoteExtSpec;
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};
 
@@ -124,6 +125,7 @@ impl ComputeControlPlane {
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
        let pageserver =
            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
@@ -168,6 +170,30 @@ impl ComputeControlPlane {
 
        Ok(ep)
    }
+
+    pub fn check_conflicting_endpoints(
+        &self,
+        mode: ComputeMode,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<()> {
+        if matches!(mode, ComputeMode::Primary) {
+            // this check is not complete, as you could have a concurrent attempt at
+            // creating another primary, both reading the state before checking it here,
+            // but it's better than nothing.
+            let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
+                v.tenant_id == tenant_id
+                    && v.timeline_id == timeline_id
+                    && v.mode == mode
+                    && v.status() != "stopped"
+            });
+
+            if let Some((key, _)) = duplicates.next() {
+                bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
+            }
+        }
+        Ok(())
+    }
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -476,11 +502,24 @@ impl Endpoint {
            }
        }
 
+        // check for file remote_extensions_spec.json
+        // if it is present, read it and pass to compute_ctl
+        let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
+        let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
+        let remote_extensions: Option<RemoteExtSpec>;
+
+        if let Ok(spec_file) = remote_extensions_spec {
+            remote_extensions = serde_json::from_reader(spec_file).ok();
+        } else {
+            remote_extensions = None;
+        };
+
        // Create spec file
        let spec = ComputeSpec {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
+            features: vec![],
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
@@ -497,7 +536,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            remote_extensions: None,
+            remote_extensions,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -11,6 +11,7 @@ use std::io::{BufReader, Write};
 use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::process::{Child, Command};
+use std::time::Duration;
 use std::{io, result};
 
 use anyhow::{bail, Context};
@@ -522,19 +523,24 @@ impl PageServerNode {
        &self,
        tenant_id: TenantId,
        config: LocationConfig,
+        flush_ms: Option<Duration>,
    ) -> anyhow::Result<()> {
        let req_body = TenantLocationConfigRequest { tenant_id, config };
 
-        self.http_request(
-            Method::PUT,
-            format!(
-                "{}/tenant/{}/location_config",
-                self.http_base_url, tenant_id
-            ),
-        )?
-        .json(&req_body)
-        .send()?
-        .error_from_body()?;
+        let path = format!(
+            "{}/tenant/{}/location_config",
+            self.http_base_url, tenant_id
+        );
+        let path = if let Some(flush_ms) = flush_ms {
+            format!("{}?flush_ms={}", path, flush_ms.as_millis())
+        } else {
+            path
+        };
+
+        self.http_request(Method::PUT, path)?
+            .json(&req_body)
+            .send()?
+            .error_from_body()?;
 
        Ok(())
    }
@@ -559,6 +565,7 @@ impl PageServerNode {
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
+        existing_initdb_timeline_id: Option<TimelineId>,
    ) -> anyhow::Result<TimelineInfo> {
        // If timeline ID was not specified, generate one
        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
@@ -572,6 +579,7 @@ impl PageServerNode {
                ancestor_start_lsn,
                ancestor_timeline_id,
                pg_version,
+                existing_initdb_timeline_id,
            })
            .send()?
            .error_from_body()?
@@ -14,7 +14,6 @@ use pageserver_api::models::{
 use std::collections::HashMap;
 use std::time::Duration;
 use utils::{
-    generation::Generation,
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };
@@ -93,6 +92,22 @@ pub fn migrate_tenant(
    // Get a new generation
    let attachment_service = AttachmentService::from_env(env);
 
+    fn build_location_config(
+        mode: LocationConfigMode,
+        generation: Option<u32>,
+        secondary_conf: Option<LocationConfigSecondary>,
+    ) -> LocationConfig {
+        LocationConfig {
+            mode,
+            generation,
+            secondary_conf,
+            tenant_conf: TenantConfig::default(),
+            shard_number: 0,
+            shard_count: 0,
+            shard_stripe_size: 0,
+        }
+    }
+
    let previous = attachment_service.inspect(tenant_id)?;
    let mut baseline_lsns = None;
    if let Some((generation, origin_ps_id)) = &previous {
@@ -101,40 +116,26 @@ pub fn migrate_tenant(
        if origin_ps_id == &dest_ps.conf.id {
            println!("🔁 Already attached to {origin_ps_id}, freshening...");
            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
-            let dest_conf = LocationConfig {
-                mode: LocationConfigMode::AttachedSingle,
-                generation: gen.map(Generation::new),
-                secondary_conf: None,
-                tenant_conf: TenantConfig::default(),
-            };
-            dest_ps.location_config(tenant_id, dest_conf)?;
+            let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
+            dest_ps.location_config(tenant_id, dest_conf, None)?;
            println!("✅ Migration complete");
            return Ok(());
        }
 
        println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
 
-        let stale_conf = LocationConfig {
-            mode: LocationConfigMode::AttachedStale,
-            generation: Some(Generation::new(*generation)),
-            secondary_conf: None,
-            tenant_conf: TenantConfig::default(),
-        };
-        origin_ps.location_config(tenant_id, stale_conf)?;
+        let stale_conf =
+            build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
+        origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
 
        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
    }
 
    let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
-    let dest_conf = LocationConfig {
-        mode: LocationConfigMode::AttachedMulti,
-        generation: gen.map(Generation::new),
-        secondary_conf: None,
-        tenant_conf: TenantConfig::default(),
-    };
+    let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
 
    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
-    dest_ps.location_config(tenant_id, dest_conf)?;
+    dest_ps.location_config(tenant_id, dest_conf, None)?;
 
    if let Some(baseline) = baseline_lsns {
        println!("🕑 Waiting for LSN to catch up...");
@@ -164,37 +165,31 @@ pub fn migrate_tenant(
        let found = other_ps_tenants
            .into_iter()
            .map(|t| t.id)
-            .any(|i| i == tenant_id);
+            .any(|i| i.tenant_id == tenant_id);
        if !found {
            continue;
        }
 
        // Downgrade to a secondary location
-        let secondary_conf = LocationConfig {
-            mode: LocationConfigMode::Secondary,
-            generation: None,
-            secondary_conf: Some(LocationConfigSecondary { warm: true }),
-            tenant_conf: TenantConfig::default(),
-        };
+        let secondary_conf = build_location_config(
+            LocationConfigMode::Secondary,
+            None,
+            Some(LocationConfigSecondary { warm: true }),
+        );
 
        println!(
            "💤 Switching to secondary mode on pageserver {}",
            other_ps.conf.id
        );
-        other_ps.location_config(tenant_id, secondary_conf)?;
+        other_ps.location_config(tenant_id, secondary_conf, None)?;
    }
 
    println!(
        "🔁 Switching to AttachedSingle mode on pageserver {}",
        dest_ps.conf.id
    );
-    let dest_conf = LocationConfig {
-        mode: LocationConfigMode::AttachedSingle,
-        generation: gen.map(Generation::new),
-        secondary_conf: None,
-        tenant_conf: TenantConfig::default(),
-    };
-    dest_ps.location_config(tenant_id, dest_conf)?;
+    let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
+    dest_ps.location_config(tenant_id, dest_conf, None)?;
 
    println!("✅ Migration complete");
docs/rfcs/029-pageserver-wal-disaster-recovery.md (new file, 205 lines)
@@ -0,0 +1,205 @@

# Name

Created on: 2023-09-08
Author: Arpad Müller

## Summary

Enable the pageserver to recover from data corruption events by implementing
a feature to re-apply historic WAL records in parallel to the already occurring
WAL replay.

The feature is outside of the user-visible backup and history story, and only
serves as a second-level backup for the case that there is a bug in the
pageservers that corrupted the served pages.

The RFC proposes the addition of two new features:
* recover a broken branch from WAL (downtime is allowed)
* a test recovery system to recover random branches to make sure recovery works

## Motivation

The historic WAL is currently stored in S3 even after it has been replayed by
the pageserver and thus been integrated into the pageserver's storage system.
This is done to defend from data corruption failures inside the pageservers.

However, application of this WAL in the disaster recovery setting is currently
very manual, and we want to automate it to make it easier.

### Use cases

There are various use cases for this feature, like:

* The main motivation is replaying in the instance of pageservers corrupting
  data.
* We might want to, beyond the user-visible history features, through our
  support channels and upon customer request, in select instances, recover
  historic versions beyond the range of history that we officially support.
* Running the recovery process in the background for random tenant timelines
  to figure out if there was a corruption of data (we would compare with what
  the pageserver stores for the "official" timeline).
* Using the WAL to arrive at historic pages we can then back up to S3 so that
  the WAL itself can be discarded, or at least not used for future replays.
  Again, this sounds a lot like what the pageserver is already doing, but the
  point is to provide a fallback to the service provided by the pageserver.

## Design

### Design constraints

The main design constraint is that the feature needs to be *simple* enough that
the number of bugs is as low, and reliability as high, as possible: the main
goal of this endeavour is to achieve higher correctness than the pageserver.

For the background process, we cannot afford a downtime of the timeline that is
being cloned, as we don't want to restrict ourselves to offline tenants only.
In the scenario where we want to recover from disasters or roll back to a
historic lsn through support staff, downtimes are more affordable, and
inevitable if the original had been subject to the corruption. Ideally, the
two code paths would share code, so the solution would be designed for not
requiring downtimes.

### API endpoint changes

This RFC proposes two API endpoint changes, in the safekeeper and the
pageserver.

Remember, the pageserver timeline creation API endpoint is this URL:

```
/v1/tenant/{tenant_id}/timeline/
```

Where `{tenant_id}` is the ID of the tenant the timeline is created for,
and specified as part of the URL. The timeline ID is passed via the POST
request body as the only required parameter `new_timeline_id`.

This proposal adds one optional parameter called
`existing_initdb_timeline_id` to the request's json body. If the parameter
is not specified, behaviour stays as it is today: the pageserver runs
initdb.
If the parameter is specified, it is expected to point to a timeline ID.
In fact that ID might match `new_timeline_id`; what's important is that
S3 storage contains a matching initdb under the URL matching the given
tenant and timeline.

Having both `ancestor_timeline_id` and `existing_initdb_timeline_id`
specified is illegal and will yield an HTTP error. This feature is
only meant for the "main" branch that doesn't have any ancestors
of its own, as initdb is only relevant there.
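
For illustration only (the timeline ID and `pg_version` value below are
made-up placeholders, not part of the API change itself), a creation request
that reuses a previously uploaded initdb might look roughly like this:

```
POST /v1/tenant/{tenant_id}/timeline/
{
    "new_timeline_id": "de200bd42b49cc1814412c7e592dd6e9",
    "existing_initdb_timeline_id": "de200bd42b49cc1814412c7e592dd6e9",
    "pg_version": 15
}
```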

For the safekeeper, we propose the addition of the following copy endpoint:

```
/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
```

It is meant for POST requests with json, and takes the two URL parameters
`tenant_id` and `source_timeline_id`. The json request body contains
the two required parameters `target_timeline_id` and `until_lsn`.

After invoking, the copy endpoint starts a copy process of the WAL from
the source ID to the target ID. The lsn is updated according to the
progress of the API call.
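
As a sketch (the target timeline ID and LSN below are illustrative
placeholders), such a copy request could look like:

```
POST /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
{
    "target_timeline_id": "b0a4c3e9d1f2a5b6c7d8e9f001122334",
    "until_lsn": "0/169C3C8"
}
```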

### Higher level features

We want the API changes to support the following higher level features:

* recovery-after-corruption DR of the main timeline of a tenant. This
  feature allows for downtime.
* test DR of the main timeline into a special copy timeline. This feature
  is meant to run against selected production tenants in the background,
  without the user noticing, so it does not allow for downtime.

The recovery-after-corruption DR only needs the pageserver changes.
It works as follows (a rough sketch of the two calls follows the list):

* delete the timeline from the pageservers via the timeline deletion API
* re-create it via the timeline creation API (same ID as before) and set
  `existing_initdb_timeline_id` to the same timeline ID
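
A minimal sketch of that two-call sequence against the pageserver HTTP API
(the exact shape of the deletion route is assumed here by analogy with the
creation endpoint above; `{timeline_id}` stands for the same ID throughout):

```
# 1. delete the corrupted timeline
DELETE /v1/tenant/{tenant_id}/timeline/{timeline_id}

# 2. re-create it under the same ID, reusing the initdb previously uploaded to S3
POST /v1/tenant/{tenant_id}/timeline/
{
    "new_timeline_id": "{timeline_id}",
    "existing_initdb_timeline_id": "{timeline_id}"
}
```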

The test DR also requires the copy primitive and works as follows:

* copy the WAL of the timeline to a new place
* create a new timeline for the tenant

## Non Goals

At the risk of being repetitive, the main goal of this feature is to be a
backup method, so reliability is very important. This implies that other
aspects like performance or space reduction are less important.

### Corrupt WAL

The process suggested by this RFC assumes that the WAL is free of corruption.
In some instances, corruption can make it into WAL, like for example when
higher level components like postgres or the application first read corrupt
data, and then execute a write with data derived from that earlier read. That
written data might then contain the corruption.

Common use cases can hit this quite easily. For example, an application reads
some counter, increments it, and then writes the new counter value to the
database.
On a lower level, the compute might put FPIs (Full Page Images) into the WAL,
which have corrupt data for rows unrelated to the write operation at hand.

Separating corrupt writes from non-corrupt ones is a hard problem in general,
and if the application was involved in making the corrupt write, a recovery
would also involve the application. Therefore, corruption that has made it into
the WAL is outside of the scope of this feature. However, the WAL replay can be
issued to right before the point in time where the corruption occurred. Then the
data loss is isolated to post-corruption writes only.

## Impacted components (e.g. pageserver, safekeeper, console, etc)

Most changes would happen to the pageservers.
For the higher level features, maybe other components like the console would
be involved.

We need to make sure that the shadow timelines are not subject to the usual
limits and billing we apply to existing timelines.

## Proposed implementation

The first problem to keep in mind is the reproducibility of `initdb`.
So an initial step would be to upload `initdb` snapshots to S3.

After that, we'd have the endpoint spawn a background process which
performs the replay of the WAL to that new timeline. This process should
follow the existing workflows as closely as possible, just using the
WAL records of a different timeline.

The timeline created will be in a special state that solely looks for WAL
entries of the timeline it is trying to copy. Once the target LSN is reached,
it turns into a normal timeline that also accepts writes to its own
timeline ID.

### Scalability

For now we want to run this entire process on a single node, and as
it is by nature linear, it's hard to parallelize. However, for the
verification workloads, we can easily start the WAL replay in parallel
for different points in time. This is valuable especially for tenants
with large WAL records.

Compare this with the tricks to make addition circuits execute with
lower latency by making them perform the addition for both possible
values of the carry bit, and then, in a second step, taking the
result for the carry bit that was actually obtained.

The other scalability dimension to consider is the WAL length, which
is a growing concern as tenants accumulate changes. There are
possible approaches to this, including creating snapshots of the
page files and uploading them to S3, but if we do this for every single
branch, we lose the cheap branching property.

### Implementation by component

The proposed changes for the various components of the neon architecture
are written up in this notion page:

https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2

### Unresolved questions

None known (outside of the ones mentioned above).
@@ -26,6 +26,13 @@ pub struct ComputeSpec {
    // but we don't use it for anything. Serde will ignore missing fields when
    // deserializing it.
    pub operation_uuid: Option<String>,
+
+    /// Compute features to enable. These feature flags are provided, when we
+    /// know all the details about client's compute, so they cannot be used
+    /// to change `Empty` compute behavior.
+    #[serde(default)]
+    pub features: Vec<ComputeFeature>,
+
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
@@ -68,6 +75,19 @@ pub struct ComputeSpec {
    pub remote_extensions: Option<RemoteExtSpec>,
 }
 
+/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeFeature {
+    // XXX: Add more feature flags here.
+
+    // This is a special feature flag that is used to represent unknown feature flags.
+    // Basically all unknown to enum flags are represented as this one. See unit test
+    // `parse_unknown_features()` for more details.
+    #[serde(other)]
+    UnknownFeature,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -187,8 +207,6 @@ pub struct DeltaOp {
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
-    pub replication: Option<bool>,
-    pub bypassrls: Option<bool>,
    pub options: GenericOptions,
 }
 
@@ -229,7 +247,10 @@ mod tests {
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
-        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+
+        // Features list defaults to empty vector.
+        assert!(spec.features.is_empty());
    }
 
    #[test]
@@ -241,4 +262,22 @@ mod tests {
        ob.insert("unknown_field_123123123".into(), "hello".into());
        let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
    }
+
+    #[test]
+    fn parse_unknown_features() {
+        // Test that unknown feature flags do not cause any errors.
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
+        let ob = json.as_object_mut().unwrap();
+
+        // Add unknown feature flags.
+        let features = vec!["foo_bar_feature", "baz_feature"];
+        ob.insert("features".into(), features.into());
+
+        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
+
+        assert!(spec.features.len() == 2);
+        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
+        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
+    }
 }
@@ -18,6 +18,7 @@ enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 hex.workspace = true
+thiserror.workspace = true
 
 workspace_hack.workspace = true
 
@@ -4,7 +4,9 @@
 //! See docs/rfcs/025-generation-numbers.md
 
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::NodeId;
+
+use crate::shard::TenantShardId;
 
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachRequest {
@@ -13,7 +15,7 @@ pub struct ReAttachRequest {
 
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub gen: u32,
 }
 
@@ -24,7 +26,7 @@ pub struct ReAttachResponse {
 
 #[derive(Serialize, Deserialize)]
 pub struct ValidateRequestTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub gen: u32,
 }
 
@@ -40,6 +42,6 @@ pub struct ValidateResponse {
 
 #[derive(Serialize, Deserialize)]
 pub struct ValidateResponseTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub valid: bool,
 }
@@ -140,3 +140,7 @@ impl Key {
        })
    }
 }
+
+pub fn is_rel_block_key(key: &Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0
+}
@@ -10,7 +10,6 @@ use serde_with::serde_as;
 use strum_macros;
 use utils::{
    completion,
-    generation::Generation,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
 };
@@ -180,6 +179,8 @@ pub struct TimelineCreateRequest {
    #[serde(default)]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
+    pub existing_initdb_timeline_id: Option<TimelineId>,
+    #[serde(default)]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }
@@ -262,10 +263,19 @@ pub struct LocationConfig {
    pub mode: LocationConfigMode,
    /// If attaching, in what generation?
    #[serde(default)]
-    pub generation: Option<Generation>,
+    pub generation: Option<u32>,
    #[serde(default)]
    pub secondary_conf: Option<LocationConfigSecondary>,
+
+    // Shard parameters: if shard_count is nonzero, then other shard_* fields
+    // must be set accurately.
+    #[serde(default)]
+    pub shard_number: u8,
+    #[serde(default)]
+    pub shard_count: u8,
+    #[serde(default)]
+    pub shard_stripe_size: u32,
+
    // If requesting mode `Secondary`, configuration for that.
    // Custom storage configuration for the tenant, if any
    pub tenant_conf: TenantConfig,
@@ -306,31 +316,14 @@ impl std::ops::Deref for TenantConfigRequest {
 
 impl TenantConfigRequest {
    pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
-        let config = TenantConfig {
-            checkpoint_distance: None,
-            checkpoint_timeout: None,
-            compaction_target_size: None,
-            compaction_period: None,
-            compaction_threshold: None,
-            gc_horizon: None,
-            gc_period: None,
-            image_creation_threshold: None,
-            pitr_interval: None,
-            walreceiver_connect_timeout: None,
-            lagging_wal_timeout: None,
-            max_lsn_wal_lag: None,
-            trace_read_requests: None,
-            eviction_policy: None,
-            min_resident_size_override: None,
-            evictions_low_residence_duration_metric_threshold: None,
-            gc_feedback: None,
-        };
+        let config = TenantConfig::default();
        TenantConfigRequest { tenant_id, config }
    }
 }
 
 #[derive(Debug, Deserialize)]
 pub struct TenantAttachRequest {
+    #[serde(default)]
    pub config: TenantAttachConfig,
    #[serde(default)]
    pub generation: Option<u32>,
@@ -338,7 +331,7 @@ pub struct TenantAttachRequest {
 
 /// Newtype to enforce deny_unknown_fields on TenantConfig for
 /// its usage inside `TenantAttachRequest`.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, Default)]
 #[serde(deny_unknown_fields)]
 pub struct TenantAttachConfig {
    #[serde(flatten)]
@@ -364,7 +357,7 @@ pub enum TenantAttachmentStatus {
 
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    pub id: TenantId,
+    pub id: TenantShardId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
@@ -376,7 +369,7 @@ pub struct TenantInfo {
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
-    pub tenant_id: TenantId,
+    pub tenant_id: TenantShardId,
    pub timeline_id: TimelineId,
 
    pub ancestor_timeline_id: Option<TimelineId>,
@@ -392,7 +385,12 @@ pub struct TimelineInfo {
    /// The LSN that we are advertizing to safekeepers
    pub remote_consistent_lsn_visible: Lsn,
 
-    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+    /// The LSN from the start of the root timeline (never changes)
+    pub initdb_lsn: Lsn,
+
+    pub current_logical_size: u64,
+    pub current_logical_size_is_accurate: bool,
+
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
@@ -828,7 +826,7 @@ mod tests {
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
-            id: TenantId::generate(),
+            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
@@ -845,7 +843,7 @@ mod tests {
        });
 
        let original_broken = TenantInfo {
-            id: TenantId::generate(),
+            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
@@ -1,13 +1,15 @@
|
|||||||
use std::{ops::RangeInclusive, str::FromStr};
|
use std::{ops::RangeInclusive, str::FromStr};
|
||||||
|
|
||||||
|
use crate::key::{is_rel_block_key, Key};
|
||||||
use hex::FromHex;
|
use hex::FromHex;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use thiserror;
|
||||||
use utils::id::TenantId;
|
use utils::id::TenantId;
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardNumber(pub u8);
|
pub struct ShardNumber(pub u8);
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardCount(pub u8);
|
pub struct ShardCount(pub u8);
|
||||||
|
|
||||||
impl ShardCount {
|
impl ShardCount {
|
||||||
@@ -38,7 +40,7 @@ impl ShardNumber {
|
|||||||
/// Note that the binary encoding is _not_ backward compatible, because
|
/// Note that the binary encoding is _not_ backward compatible, because
|
||||||
/// at the time sharding is introduced, there are no existing binary structures
|
/// at the time sharding is introduced, there are no existing binary structures
|
||||||
/// containing TenantId that we need to handle.
|
/// containing TenantId that we need to handle.
|
||||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
|
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||||
pub struct TenantShardId {
|
pub struct TenantShardId {
|
||||||
pub tenant_id: TenantId,
|
pub tenant_id: TenantId,
|
||||||
pub shard_number: ShardNumber,
|
pub shard_number: ShardNumber,
|
||||||
@@ -71,19 +73,33 @@ impl TenantShardId {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn shard_slug(&self) -> String {
|
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
|
||||||
format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
ShardSlug(self)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convenience for code that has special behavior on the 0th shard.
|
||||||
|
pub fn is_zero(&self) -> bool {
|
||||||
|
self.shard_number == ShardNumber(0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Formatting helper
|
||||||
|
struct ShardSlug<'a>(&'a TenantShardId);
|
||||||
|
|
||||||
|
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"{:02x}{:02x}",
|
||||||
|
self.0.shard_number.0, self.0.shard_count.0
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for TenantShardId {
|
impl std::fmt::Display for TenantShardId {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
if self.shard_count != ShardCount(0) {
|
if self.shard_count != ShardCount(0) {
|
||||||
write!(
|
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
|
||||||
f,
|
|
||||||
"{}-{:02x}{:02x}",
|
|
||||||
self.tenant_id, self.shard_number.0, self.shard_count.0
|
|
||||||
)
|
|
||||||
} else {
|
} else {
|
||||||
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
|
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
|
||||||
// is distinct from the normal single shard case (shard count == 1).
|
// is distinct from the normal single shard case (shard count == 1).
|
||||||
@@ -139,6 +155,89 @@ impl From<[u8; 18]> for TenantShardId {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||

/// For use within the context of a particular tenant, when we need to know which
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
pub struct ShardIndex {
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
}

impl ShardIndex {
    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
        Self {
            shard_number: number,
            shard_count: count,
        }
    }
    pub fn unsharded() -> Self {
        Self {
            shard_number: ShardNumber(0),
            shard_count: ShardCount(0),
        }
    }

    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }

    /// For use in constructing remote storage paths: concatenate this with a TenantId
    /// to get a fully qualified TenantShardId.
    ///
    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
    /// that the legacy pre-sharding remote key format is preserved.
    pub fn get_suffix(&self) -> String {
        if self.is_unsharded() {
            "".to_string()
        } else {
            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
        }
    }
}

impl std::fmt::Display for ShardIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
    }
}

impl std::fmt::Debug for ShardIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Debug is the same as Display: the compact hex representation
        write!(f, "{}", self)
    }
}

impl std::str::FromStr for ShardIndex {
    type Err = hex::FromHexError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Expect format: 1 byte shard number, 1 byte shard count
        if s.len() == 4 {
            let bytes = s.as_bytes();
            let mut shard_parts: [u8; 2] = [0u8; 2];
            hex::decode_to_slice(bytes, &mut shard_parts)?;
            Ok(Self {
                shard_number: ShardNumber(shard_parts[0]),
                shard_count: ShardCount(shard_parts[1]),
            })
        } else {
            Err(hex::FromHexError::InvalidStringLength)
        }
    }
}

impl From<[u8; 2]> for ShardIndex {
    fn from(b: [u8; 2]) -> Self {
        Self {
            shard_number: ShardNumber(b[0]),
            shard_count: ShardCount(b[1]),
        }
    }
}

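To make the backward-compatibility note on `get_suffix` concrete, here is a hedged sketch of how the suffix composes into a remote object key; the exact key layout used by the pageserver is an assumption, and only `get_suffix` comes from the code above:

```rust
// Illustrative helper (hypothetical), showing why the empty suffix preserves the
// legacy pre-sharding key format.
fn index_part_key(tenant_id: &str, timeline_id: &str, shard: ShardIndex) -> String {
    // Unsharded: tenants/<tenant>/timelines/<timeline>/index_part.json
    // Sharded:   tenants/<tenant>-0104/timelines/<timeline>/index_part.json
    format!(
        "tenants/{}{}/timelines/{}/index_part.json",
        tenant_id,
        shard.get_suffix(),
        timeline_id
    )
}
```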
impl Serialize for TenantShardId {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where

@@ -209,6 +308,261 @@ impl<'de> Deserialize<'de> for TenantShardId {
    }
}

/// Stripe size in number of pages
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);

/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardLayout(u8);

const LAYOUT_V1: ShardLayout = ShardLayout(1);
/// ShardIdentity uses a magic layout value to indicate if it is unusable
const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);

/// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);

/// The ShardIdentity contains the information needed for one member of a map
/// to resolve a key to a shard, and then check whether that shard is ==self.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
    pub number: ShardNumber,
    pub count: ShardCount,
    stripe_size: ShardStripeSize,
    layout: ShardLayout,
}

#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError {
    #[error("Invalid shard count")]
    InvalidCount,
    #[error("Invalid shard number")]
    InvalidNumber,
    #[error("Invalid stripe size")]
    InvalidStripeSize,
}

impl ShardIdentity {
    /// An identity with number=0 count=0 is a "none" identity, which represents legacy
    /// tenants. Modern single-shard tenants should not use this: they should
    /// have number=0 count=1.
    pub fn unsharded() -> Self {
        Self {
            number: ShardNumber(0),
            count: ShardCount(0),
            layout: LAYOUT_V1,
            stripe_size: DEFAULT_STRIPE_SIZE,
        }
    }

    /// A broken instance of this type is only used for `TenantState::Broken` tenants,
    /// which are constructed in code paths that don't have access to proper configuration.
    ///
    /// A ShardIdentity in this state may not be used for anything, and should not be persisted.
    /// Enforcement is via assertions, to avoid making our interface fallible for this
    /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
    /// state, and by extension to avoid trying to do any page->shard resolution.
    pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
        Self {
            number,
            count,
            layout: LAYOUT_BROKEN,
            stripe_size: DEFAULT_STRIPE_SIZE,
        }
    }

    pub fn is_unsharded(&self) -> bool {
        self.number == ShardNumber(0) && self.count == ShardCount(0)
    }

    /// Count must be nonzero, and number must be < count. To construct
    /// the legacy case (count==0), use Self::unsharded instead.
    pub fn new(
        number: ShardNumber,
        count: ShardCount,
        stripe_size: ShardStripeSize,
    ) -> Result<Self, ShardConfigError> {
        if count.0 == 0 {
            Err(ShardConfigError::InvalidCount)
        } else if number.0 > count.0 - 1 {
            Err(ShardConfigError::InvalidNumber)
        } else if stripe_size.0 == 0 {
            Err(ShardConfigError::InvalidStripeSize)
        } else {
            Ok(Self {
                number,
                count,
                layout: LAYOUT_V1,
                stripe_size,
            })
        }
    }

    fn is_broken(&self) -> bool {
        self.layout == LAYOUT_BROKEN
    }

    pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
        assert!(!self.is_broken());
        key_to_shard_number(self.count, self.stripe_size, key)
    }

    /// Return true if the key should be ingested by this shard
    pub fn is_key_local(&self, key: &Key) -> bool {
        assert!(!self.is_broken());
        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
            true
        } else {
            key_to_shard_number(self.count, self.stripe_size, key) == self.number
        }
    }

    pub fn shard_slug(&self) -> String {
        if self.count > ShardCount(0) {
            format!("-{:02x}{:02x}", self.number.0, self.count.0)
        } else {
            String::new()
        }
    }

    /// Convenience for checking if this identity is the 0th shard in a tenant,
    /// for special cases on shard 0 such as ingesting relation sizes.
    pub fn is_zero(&self) -> bool {
        self.number == ShardNumber(0)
    }
}
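A quick illustration of the constructor and helpers above, as a sketch only (it assumes it runs inside this module, since `DEFAULT_STRIPE_SIZE` is private):

```rust
fn shard_identity_sketch() -> Result<(), ShardConfigError> {
    // Shard 1 of 4 with the default stripe size.
    let identity = ShardIdentity::new(ShardNumber(1), ShardCount(4), DEFAULT_STRIPE_SIZE)?;

    assert!(!identity.is_unsharded());
    assert!(!identity.is_zero());
    assert_eq!(identity.shard_slug(), "-0104");

    // Out-of-range shard numbers are rejected rather than silently wrapped.
    assert_eq!(
        ShardIdentity::new(ShardNumber(4), ShardCount(4), DEFAULT_STRIPE_SIZE),
        Err(ShardConfigError::InvalidNumber)
    );
    Ok(())
}
```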

impl Serialize for ShardIndex {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
            // Binary encoding is not used in index_part.json, but is included in anticipation of
            // switching various structures (e.g. inter-process communication, remote metadata) to more
            // compact binary encodings in future.
            let mut packed: [u8; 2] = [0; 2];
            packed[0] = self.shard_number.0;
            packed[1] = self.shard_count.0;
            packed.serialize(serializer)
        }
    }
}

impl<'de> Deserialize<'de> for ShardIndex {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct IdVisitor {
            is_human_readable_deserializer: bool,
        }

        impl<'de> serde::de::Visitor<'de> for IdVisitor {
            type Value = ShardIndex;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str("value in form of hex string")
                } else {
                    formatter.write_str("value in form of integer array([u8; 2])")
                }
            }

            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let s = serde::de::value::SeqAccessDeserializer::new(seq);
                let id: [u8; 2] = Deserialize::deserialize(s)?;
                Ok(ShardIndex::from(id))
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                ShardIndex::from_str(v).map_err(E::custom)
            }
        }

        if deserializer.is_human_readable() {
            deserializer.deserialize_str(IdVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_tuple(
                2,
                IdVisitor {
                    is_human_readable_deserializer: false,
                },
            )
        }
    }
}

/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
/// in order to be able to serve basebackup requests without peer communication).
fn key_is_shard0(key: &Key) -> bool {
    // To decide what to shard out to shards >0, we apply a simple rule that only
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0. This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
    //
    // In this condition:
    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
    //   all metadata.
    // - field6 is set to -1 for relation size pages.
    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
}

/// Provide the same result as the function in postgres `hashfn.h` with the same name
fn murmurhash32(mut h: u32) -> u32 {
    h ^= h >> 16;
    h = h.wrapping_mul(0x85ebca6b);
    h ^= h >> 13;
    h = h.wrapping_mul(0xc2b2ae35);
    h ^= h >> 16;
    h
}

/// Provide the same result as the function in postgres `hashfn.h` with the same name
fn hash_combine(mut a: u32, mut b: u32) -> u32 {
    b = b.wrapping_add(0x9e3779b9);
    b = b.wrapping_add(a << 6);
    b = b.wrapping_add(a >> 2);

    a ^= b;
    a
}

/// Where a Key is to be distributed across shards, select the shard. This function
/// does not account for keys that should be broadcast across shards.
///
/// The hashing in this function must exactly match what we do in postgres smgr
/// code. The resulting distribution of pages is intended to preserve locality within
/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
/// distributing data pseudo-randomly.
///
/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
/// and will be handled at higher levels when shards are split.
fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
    // Fast path for un-sharded tenants or broadcast keys
    if count < ShardCount(2) || key_is_shard0(key) {
        return ShardNumber(0);
    }

    // relNode
    let mut hash = murmurhash32(key.field4);
    // blockNum/stripe size
    hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));

    ShardNumber((hash % count.0 as u32) as u8)
}
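Because the hash only sees the relation (field4) and the stripe index (field6 divided by the stripe size), every block inside one stripe of a relation resolves to the same shard; only crossing a stripe boundary can change the placement. A small sketch of that locality property, using the private helpers above (so it assumes module scope; the relation number is arbitrary):

```rust
fn stripe_locality_sketch() {
    let count = ShardCount(8);
    let stripe = DEFAULT_STRIPE_SIZE.0; // 32768 pages per stripe

    let shard_of = |rel: u32, block: u32| -> u8 {
        let mut hash = murmurhash32(rel);
        hash = hash_combine(hash, murmurhash32(block / stripe));
        (hash % count.0 as u32) as u8
    };

    // All blocks in the first stripe of a relation land on the same shard...
    assert_eq!(shard_of(0x400c, 0), shard_of(0x400c, stripe - 1));
    // ...while the next stripe is placed pseudo-randomly and may differ.
    let _next_stripe_shard = shard_of(0x400c, stripe);
}
```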

#[cfg(test)]
mod tests {
    use std::str::FromStr;

@@ -318,4 +672,91 @@ mod tests {

        Ok(())
    }

    #[test]
    fn shard_identity_validation() -> Result<(), ShardConfigError> {
        // Happy cases
        ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?;
        ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?;
        ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?;

        assert_eq!(
            ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE),
            Err(ShardConfigError::InvalidCount)
        );
        assert_eq!(
            ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE),
            Err(ShardConfigError::InvalidNumber)
        );
        assert_eq!(
            ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE),
            Err(ShardConfigError::InvalidNumber)
        );
        assert_eq!(
            ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE),
            Err(ShardConfigError::InvalidNumber)
        );
        assert_eq!(
            ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)),
            Err(ShardConfigError::InvalidStripeSize)
        );

        Ok(())
    }

    #[test]
    fn shard_index_human_encoding() -> Result<(), hex::FromHexError> {
        let example = ShardIndex {
            shard_number: ShardNumber(13),
            shard_count: ShardCount(17),
        };
        let expected: String = "0d11".to_string();
        let encoded = format!("{example}");
        assert_eq!(&encoded, &expected);

        let decoded = ShardIndex::from_str(&encoded)?;
        assert_eq!(example, decoded);
        Ok(())
    }

    #[test]
    fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> {
        let example = ShardIndex {
            shard_number: ShardNumber(13),
            shard_count: ShardCount(17),
        };
        let expected: [u8; 2] = [0x0d, 0x11];

        let encoded = bincode::serialize(&example).unwrap();
        assert_eq!(Hex(&encoded), Hex(&expected));
        let decoded = bincode::deserialize(&encoded).unwrap();
        assert_eq!(example, decoded);

        Ok(())
    }

    // These are only smoke tests to spot check that our implementation doesn't
    // deviate from a few examples values: not aiming to validate the overall
    // hashing algorithm.
    #[test]
    fn murmur_hash() {
        assert_eq!(murmurhash32(0), 0);

        assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
    }

    #[test]
    fn shard_mapping() {
        let key = Key {
            field1: 0x00,
            field2: 0x67f,
            field3: 0x5,
            field4: 0x400c,
            field5: 0x00,
            field6: 0x7d06,
        };

        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
        assert_eq!(shard, ShardNumber(8));
    }
}

@@ -289,10 +289,10 @@ impl FeStartupPacket {
        // We shouldn't advance `buf` as probably full message is not there yet,
        // so can't directly use Bytes::get_u32 etc.
        let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
-       // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+       // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
        // which is less readable
        #[allow(clippy::manual_range_contains)]
-       if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
+       if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
            return Err(ProtocolError::Protocol(format!(
                "invalid startup packet message length {}",
                len
@@ -975,4 +975,10 @@ mod tests {
        let params = make_params("foo\\ bar \\ \\\\ baz\\ lol");
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }

    #[test]
    fn parse_fe_startup_packet_regression() {
        let data = [0, 0, 0, 7, 0, 0, 0, 0];
        FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
    }
}
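The minimum length moving from 4 to 8 matches the wire format: a startup-family packet begins with a 4-byte length (which counts itself) followed by a 4-byte protocol or request code, so an advertised length below 8 cannot hold even an empty packet (SSLRequest, for example, is exactly 8 bytes). A hedged sketch of that framing; the helper below is illustrative and not part of the diff:

```rust
// Illustrative only: the smallest well-formed startup-family packet is 8 bytes.
fn minimal_startup_packet(code: u32) -> Vec<u8> {
    let len: u32 = 8; // 4 bytes of length + 4 bytes of version/request code
    let mut buf = Vec::with_capacity(len as usize);
    buf.extend_from_slice(&len.to_be_bytes());
    buf.extend_from_slice(&code.to_be_bytes());
    buf
}
// The regression test above advertises a length of 7, which the parser now
// rejects up front instead of trying to read a truncated body.
```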

@@ -9,18 +9,18 @@ anyhow.workspace = true
async-trait.workspace = true
once_cell.workspace = true
aws-smithy-async.workspace = true
-aws-smithy-http.workspace = true
+aws-smithy-types.workspace = true
-aws-types.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
bytes.workspace = true
camino.workspace = true
hyper = { workspace = true, features = ["stream"] }
+futures.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
-tokio-util.workspace = true
+tokio-util = { workspace = true, features = ["compat"] }
toml_edit.workspace = true
tracing.workspace = true
scopeguard.workspace = true

@@ -1,21 +1,24 @@
//! Azure Blob Storage wrapper

+use std::borrow::Cow;
use std::collections::HashMap;
use std::env;
use std::num::NonZeroU32;
+use std::pin::Pin;
use std::sync::Arc;
-use std::{borrow::Cow, io::Cursor};

use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
use azure_core::request_options::{MaxResults, Metadata, Range};
+use azure_core::RetryOptions;
use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials;
use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
+use bytes::Bytes;
+use futures::stream::Stream;
use futures_util::StreamExt;
use http_types::StatusCode;
-use tokio::io::AsyncRead;
use tracing::debug;

use crate::s3_bucket::RequestKind;
@@ -49,7 +52,8 @@ impl AzureBlobStorage {
            StorageCredentials::token_credential(Arc::new(token_credential))
        };

-       let builder = ClientBuilder::new(account, credentials);
+       // we have an outer retry
+       let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());

        let client = builder.container_client(azure_config.container_name.to_owned());

@@ -116,7 +120,8 @@ impl AzureBlobStorage {
        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
-       let mut buf = Vec::new();
+       let mut bufs = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
            if let Some(blob_meta) = part.blob.metadata {
@@ -127,10 +132,10 @@ impl AzureBlobStorage {
                .collect()
                .await
                .map_err(|e| DownloadError::Other(e.into()))?;
-           buf.extend_from_slice(&data.slice(..));
+           bufs.push(data);
        }
        Ok(Download {
-           download_stream: Box::pin(Cursor::new(buf)),
+           download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
            metadata: Some(StorageMetadata(metadata)),
        })
    }
@@ -217,9 +222,10 @@ impl RemoteStorage for AzureBlobStorage {
        }
        Ok(res)
    }

    async fn upload(
        &self,
-       mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
+       from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -227,13 +233,12 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Put).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(to));

-       // TODO FIX THIS UGLY HACK and don't buffer the entire object
-       // into RAM here, but use the streaming interface. For that,
-       // we'd have to change the interface though...
-       // https://github.com/neondatabase/neon/issues/5563
-       let mut buf = Vec::with_capacity(data_size_bytes);
-       tokio::io::copy(&mut from, &mut buf).await?;
-       let body = azure_core::Body::Bytes(buf.into());
+       let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
+           Box::pin(from);
+
+       let from = NonSeekableStream::new(from, data_size_bytes);
+
+       let body = azure_core::Body::SeekableStream(Box::new(from));

        let mut builder = blob_client.put_block_blob(body);

@@ -266,17 +271,12 @@ impl RemoteStorage for AzureBlobStorage {

        let mut builder = blob_client.get();

-       if let Some(end_exclusive) = end_exclusive {
-           builder = builder.range(Range::new(start_inclusive, end_exclusive));
+       let range: Range = if let Some(end_exclusive) = end_exclusive {
+           (start_inclusive..end_exclusive).into()
        } else {
-           // Open ranges are not supported by the SDK so we work around
-           // by setting the upper limit extremely high (but high enough
-           // to still be representable by signed 64 bit integers).
-           // TODO remove workaround once the SDK adds open range support
-           // https://github.com/Azure/azure-sdk-for-rust/issues/1438
-           let end_exclusive = u64::MAX / 4;
-           builder = builder.range(Range::new(start_inclusive, end_exclusive));
-       }
+           (start_inclusive..).into()
+       };
+       builder = builder.range(range);

        self.download_for_builder(builder).await
    }
@@ -312,3 +312,153 @@ impl RemoteStorage for AzureBlobStorage {
        Ok(())
    }
}

pin_project_lite::pin_project! {
    /// Hack to work around not being able to stream once with azure sdk.
    ///
    /// Azure sdk clones streams around with the assumption that they are like
    /// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
    /// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
    /// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
    /// seekable, but we can also just re-try the request easier.
    #[project = NonSeekableStreamProj]
    enum NonSeekableStream<S> {
        /// A stream wrappers initial form.
        ///
        /// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1
        /// clone before first request, then this must be changed.
        Initial {
            inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
            len: usize,
        },
        /// The actually readable variant, produced by cloning the Initial variant.
        ///
        /// The sdk currently always clones once, even without retry policy.
        Actual {
            #[pin]
            inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
            len: usize,
            read_any: bool,
        },
        /// Most likely unneeded, but left to make life easier, in case more clones are added.
        Cloned {
            len_was: usize,
        }
    }
}

impl<S> NonSeekableStream<S>
where
    S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
{
    fn new(inner: S, len: usize) -> NonSeekableStream<S> {
        use tokio_util::compat::TokioAsyncReadCompatExt;

        let inner = tokio_util::io::StreamReader::new(inner).compat();
        let inner = Some(inner);
        let inner = std::sync::Mutex::new(inner);
        NonSeekableStream::Initial { inner, len }
    }
}

impl<S> std::fmt::Debug for NonSeekableStream<S> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
            Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
            Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
        }
    }
}

impl<S> futures::io::AsyncRead for NonSeekableStream<S>
where
    S: Stream<Item = std::io::Result<Bytes>>,
{
    fn poll_read(
        self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
        buf: &mut [u8],
    ) -> std::task::Poll<std::io::Result<usize>> {
        match self.project() {
            NonSeekableStreamProj::Actual {
                inner, read_any, ..
            } => {
                *read_any = true;
                inner.poll_read(cx, buf)
            }
            // NonSeekableStream::Initial does not support reading because it is just much easier
            // to have the mutex in place where one does not poll the contents, or that's how it
            // seemed originally. If there is a version upgrade which changes the cloning, then
            // that support needs to be hacked in.
            //
            // including {self:?} into the message would be useful, but unsure how to unproject.
            _ => std::task::Poll::Ready(Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "cloned or initial values cannot be read",
            ))),
        }
    }
}

impl<S> Clone for NonSeekableStream<S> {
    /// Weird clone implementation exists to support the sdk doing cloning before issuing the first
    /// request, see type documentation.
    fn clone(&self) -> Self {
        use NonSeekableStream::*;

        match self {
            Initial { inner, len } => {
                if let Some(inner) = inner.lock().unwrap().take() {
                    Actual {
                        inner,
                        len: *len,
                        read_any: false,
                    }
                } else {
                    Self::Cloned { len_was: *len }
                }
            }
            Actual { len, .. } => Cloned { len_was: *len },
            Cloned { len_was } => Cloned { len_was: *len_was },
        }
    }
}

#[async_trait::async_trait]
impl<S> azure_core::SeekableStream for NonSeekableStream<S>
where
    S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
{
    async fn reset(&mut self) -> azure_core::error::Result<()> {
        use NonSeekableStream::*;

        let msg = match self {
            Initial { inner, .. } => {
                if inner.get_mut().unwrap().is_some() {
                    return Ok(());
                } else {
                    "reset after first clone is not supported"
                }
            }
            Actual { read_any, .. } if !*read_any => return Ok(()),
            Actual { .. } => "reset after reading is not supported",
            Cloned { .. } => "reset after second clone is not supported",
        };
        Err(azure_core::error::Error::new(
            azure_core::error::ErrorKind::Io,
            std::io::Error::new(std::io::ErrorKind::Other, msg),
        ))
    }

    // Note: it is not documented if this should be the total or remaining length, total passes the
    // tests.
    fn len(&self) -> usize {
        use NonSeekableStream::*;
        match self {
            Initial { len, .. } => *len,
            Actual { len, .. } => *len,
            Cloned { len_was, .. } => *len_was,
        }
    }
}

@@ -19,8 +19,10 @@ use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::A
use anyhow::{bail, Context};
use camino::{Utf8Path, Utf8PathBuf};

+use bytes::Bytes;
+use futures::stream::Stream;
use serde::{Deserialize, Serialize};
-use tokio::{io, sync::Semaphore};
+use tokio::sync::Semaphore;
use toml_edit::Item;
use tracing::info;

@@ -179,7 +181,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
-       from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+       from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
@@ -206,7 +208,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    }

pub struct Download {
-   pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
+   pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
}
@@ -300,7 +302,7 @@ impl GenericRemoteStorage {

    pub async fn upload(
        &self,
-       from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+       from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -398,7 +400,7 @@ impl GenericRemoteStorage {
    /// this path is used for the remote object id conversion only.
    pub async fn upload_storage_object(
        &self,
-       from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+       from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
    ) -> anyhow::Result<()> {
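With `upload` now taking a stream of `Bytes` chunks instead of an `AsyncRead`, callers wrap their sources explicitly. A hedged sketch of the two common adaptations (the storage handle, remote path, and file name here are placeholders, not from the diff):

```rust
use bytes::Bytes;
use tokio_util::io::ReaderStream;

async fn upload_examples(storage: &GenericRemoteStorage, to: &RemotePath) -> anyhow::Result<()> {
    // 1. A file on disk becomes a stream of chunks via ReaderStream.
    let file = tokio::fs::File::open("/tmp/example.bin").await?;
    let len = file.metadata().await?.len() as usize;
    storage.upload(ReaderStream::new(file), len, to, None).await?;

    // 2. An in-memory buffer becomes a one-chunk stream.
    let body = Bytes::from_static(b"hello");
    let len = body.len();
    storage
        .upload(futures::stream::once(futures::future::ready(Ok(body))), len, to, None)
        .await?;
    Ok(())
}
```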

@@ -7,11 +7,14 @@
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};

use anyhow::{bail, ensure, Context};
+use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
+use futures::stream::Stream;
use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
+use tokio_util::io::ReaderStream;
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

@@ -99,27 +102,35 @@ impl LocalFs {
        };

        // If we were given a directory, we may use it as our starting point.
-       // Otherwise, we must go up to the parent directory. This is because
+       // Otherwise, we must go up to the first ancestor dir that exists. This is because
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
-       match fs::metadata(full_path.clone()).await {
-           Ok(meta) => {
-               if !meta.is_dir() {
+       loop {
+           // Did we make it to the root?
+           if initial_dir.parent().is_none() {
+               anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
+           }
+
+           match fs::metadata(initial_dir.clone()).await {
+               Ok(meta) if meta.is_dir() => {
+                   // We found a directory, break
+                   break;
+               }
+               Ok(_meta) => {
                    // It's not a directory: strip back to the parent
                    initial_dir.pop();
                }
-           }
-           Err(e) if e.kind() == ErrorKind::NotFound => {
-               // It's not a file that exists: strip the prefix back to the parent directory
-               initial_dir.pop();
-           }
-           Err(e) => {
-               // Unexpected I/O error
-               anyhow::bail!(e)
+               Err(e) if e.kind() == ErrorKind::NotFound => {
+                   // It's not a file that exists: strip the prefix back to the parent directory
+                   initial_dir.pop();
+               }
+               Err(e) => {
+                   // Unexpected I/O error
+                   anyhow::bail!(e)
+               }
            }
        }

        // Note that Utf8PathBuf starts_with only considers full path segments, but
        // object prefixes are arbitrary strings, so we need the strings for doing
        // starts_with later.
@@ -211,7 +222,7 @@ impl RemoteStorage for LocalFs {

    async fn upload(
        &self,
-       data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+       data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -244,9 +255,12 @@ impl RemoteStorage for LocalFs {
        );

        let from_size_bytes = data_size_bytes as u64;
+       let data = tokio_util::io::StreamReader::new(data);
+       let data = std::pin::pin!(data);
        let mut buffer_to_read = data.take(from_size_bytes);

-       let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
+       // alternatively we could just write the bytes to a file, but local_fs is a testing utility
+       let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
            .await
            .with_context(|| {
                format!(
@@ -300,7 +314,7 @@ impl RemoteStorage for LocalFs {
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-           let source = io::BufReader::new(
+           let source = ReaderStream::new(
                fs::OpenOptions::new()
                    .read(true)
                    .open(&target_path)
@@ -340,16 +354,14 @@ impl RemoteStorage for LocalFs {
        }
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-           let mut source = io::BufReader::new(
-               fs::OpenOptions::new()
-                   .read(true)
-                   .open(&target_path)
-                   .await
-                   .with_context(|| {
-                       format!("Failed to open source file {target_path:?} to use in the download")
-                   })
-                   .map_err(DownloadError::Other)?,
-           );
+           let mut source = tokio::fs::OpenOptions::new()
+               .read(true)
+               .open(&target_path)
+               .await
+               .with_context(|| {
+                   format!("Failed to open source file {target_path:?} to use in the download")
+               })
+               .map_err(DownloadError::Other)?;
+
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
@@ -363,11 +375,13 @@ impl RemoteStorage for LocalFs {
            Ok(match end_exclusive {
                Some(end_exclusive) => Download {
                    metadata,
-                   download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
+                   download_stream: Box::pin(ReaderStream::new(
+                       source.take(end_exclusive - start_inclusive),
+                   )),
                },
                None => Download {
                    metadata,
-                   download_stream: Box::pin(source),
+                   download_stream: Box::pin(ReaderStream::new(source)),
                },
            })
        } else {
@@ -467,7 +481,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
    use super::*;

+   use bytes::Bytes;
    use camino_tempfile::tempdir;
+   use futures_util::Stream;
    use std::{collections::HashMap, io::Write};

    async fn read_and_assert_remote_file_contents(
@@ -477,7 +493,7 @@ mod fs_tests {
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
-       let mut download = storage
+       let download = storage
            .download(remote_storage_path)
            .await
            .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
@@ -486,13 +502,9 @@ mod fs_tests {
            "Unexpected metadata returned for the downloaded file"
        );

-       let mut contents = String::new();
-       download
-           .download_stream
-           .read_to_string(&mut contents)
-           .await
-           .context("Failed to read remote file contents into string")?;
-       Ok(contents)
+       let contents = aggregate(download.download_stream).await?;
+
+       String::from_utf8(contents).map_err(anyhow::Error::new)
    }

    #[tokio::test]
@@ -521,25 +533,26 @@ mod fs_tests {
        let storage = create_storage()?;

        let id = RemotePath::new(Utf8Path::new("dummy"))?;
-       let content = std::io::Cursor::new(b"12345");
+       let content = Bytes::from_static(b"12345");
+       let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));

        // Check that you get an error if the size parameter doesn't match the actual
        // size of the stream.
        storage
-           .upload(Box::new(content.clone()), 0, &id, None)
+           .upload(content(), 0, &id, None)
            .await
            .expect_err("upload with zero size succeeded");
        storage
-           .upload(Box::new(content.clone()), 4, &id, None)
+           .upload(content(), 4, &id, None)
            .await
            .expect_err("upload with too short size succeeded");
        storage
-           .upload(Box::new(content.clone()), 6, &id, None)
+           .upload(content(), 6, &id, None)
            .await
            .expect_err("upload with too large size succeeded");

        // Correct size is 5, this should succeed.
-       storage.upload(Box::new(content), 5, &id, None).await?;
+       storage.upload(content(), 5, &id, None).await?;

        Ok(())
    }
@@ -587,7 +600,7 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

-       let mut first_part_download = storage
+       let first_part_download = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
        assert!(
@@ -595,21 +608,13 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-       let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-       io::copy(
-           &mut first_part_download.download_stream,
-           &mut first_part_remote,
-       )
-       .await?;
-       first_part_remote.flush().await?;
-       let first_part_remote = first_part_remote.into_inner().into_inner();
+       let first_part_remote = aggregate(first_part_download.download_stream).await?;
        assert_eq!(
-           first_part_local,
-           first_part_remote.as_slice(),
+           first_part_local, first_part_remote,
            "First part bytes should be returned when requested"
        );

-       let mut second_part_download = storage
+       let second_part_download = storage
            .download_byte_range(
                &upload_target,
                first_part_local.len() as u64,
@@ -621,17 +626,9 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-       let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-       io::copy(
-           &mut second_part_download.download_stream,
-           &mut second_part_remote,
-       )
-       .await?;
-       second_part_remote.flush().await?;
-       let second_part_remote = second_part_remote.into_inner().into_inner();
+       let second_part_remote = aggregate(second_part_download.download_stream).await?;
        assert_eq!(
-           second_part_local,
-           second_part_remote.as_slice(),
+           second_part_local, second_part_remote,
            "Second part bytes should be returned when requested"
        );

@@ -721,17 +718,10 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, _) = uploaded_bytes.split_at(3);

-       let mut partial_download_with_metadata = storage
+       let partial_download_with_metadata = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
-       let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-       io::copy(
-           &mut partial_download_with_metadata.download_stream,
-           &mut first_part_remote,
-       )
-       .await?;
-       first_part_remote.flush().await?;
-       let first_part_remote = first_part_remote.into_inner().into_inner();
+       let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
        assert_eq!(
            first_part_local,
            first_part_remote.as_slice(),
@@ -807,16 +797,16 @@ mod fs_tests {
            )
        })?;

-       storage
-           .upload(Box::new(file), size, &relative_path, metadata)
-           .await?;
+       let file = tokio_util::io::ReaderStream::new(file);
+
+       storage.upload(file, size, &relative_path, metadata).await?;
        Ok(relative_path)
    }

    async fn create_file_for_upload(
        path: &Utf8Path,
        contents: &str,
-   ) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
+   ) -> anyhow::Result<(fs::File, usize)> {
        std::fs::create_dir_all(path.parent().unwrap())?;
        let mut file_for_writing = std::fs::OpenOptions::new()
            .write(true)
@@ -826,7 +816,7 @@ mod fs_tests {
        drop(file_for_writing);
        let file_size = path.metadata()?.len() as usize;
        Ok((
-           io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
+           fs::OpenOptions::new().read(true).open(&path).await?,
            file_size,
        ))
    }
@@ -840,4 +830,16 @@ mod fs_tests {
        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }

+   async fn aggregate(
+       stream: impl Stream<Item = std::io::Result<Bytes>>,
+   ) -> anyhow::Result<Vec<u8>> {
+       use futures::stream::StreamExt;
+       let mut out = Vec::new();
+       let mut stream = std::pin::pin!(stream);
+       while let Some(res) = stream.next().await {
+           out.extend_from_slice(&res?[..]);
+       }
+       Ok(out)
+   }
}
|||||||
@@ -4,9 +4,14 @@
|
|||||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||||
//! their bucket prefixes are both specified and different.
|
//! their bucket prefixes are both specified and different.
|
||||||
|
|
||||||
use std::{borrow::Cow, sync::Arc};
|
use std::{
|
||||||
|
borrow::Cow,
|
||||||
|
pin::Pin,
|
||||||
|
sync::Arc,
|
||||||
|
task::{Context, Poll},
|
||||||
|
};
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context as _;
|
||||||
use aws_config::{
|
use aws_config::{
|
||||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||||
imds::credentials::ImdsCredentialsProvider,
|
imds::credentials::ImdsCredentialsProvider,
|
||||||
@@ -14,23 +19,24 @@ use aws_config::{
|
|||||||
provider_config::ProviderConfig,
|
provider_config::ProviderConfig,
|
||||||
retry::{RetryConfigBuilder, RetryMode},
|
retry::{RetryConfigBuilder, RetryMode},
|
||||||
web_identity_token::WebIdentityTokenCredentialsProvider,
|
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||||
|
BehaviorVersion,
|
||||||
};
|
};
|
||||||
use aws_credential_types::cache::CredentialsCache;
|
use aws_credential_types::provider::SharedCredentialsProvider;
|
||||||
use aws_sdk_s3::{
|
use aws_sdk_s3::{
|
||||||
config::{AsyncSleep, Config, Region, SharedAsyncSleep},
|
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
|
||||||
error::SdkError,
|
error::SdkError,
|
||||||
operation::get_object::GetObjectError,
|
operation::get_object::GetObjectError,
|
||||||
primitives::ByteStream,
|
|
||||||
types::{Delete, ObjectIdentifier},
|
types::{Delete, ObjectIdentifier},
|
||||||
Client,
|
Client,
|
||||||
};
|
};
|
||||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||||
use aws_smithy_http::body::SdkBody;
|
|
||||||
|
use aws_smithy_types::body::SdkBody;
|
||||||
|
use aws_smithy_types::byte_stream::ByteStream;
|
||||||
|
use bytes::Bytes;
|
||||||
|
use futures::stream::Stream;
|
||||||
use hyper::Body;
|
use hyper::Body;
|
||||||
use scopeguard::ScopeGuard;
|
use scopeguard::ScopeGuard;
|
||||||
use tokio::io::{self, AsyncRead};
|
|
||||||
use tokio_util::io::ReaderStream;
|
|
||||||
use tracing::debug;
|
|
||||||
|
|
||||||
use super::StorageMetadata;
|
use super::StorageMetadata;
|
||||||
use crate::{
|
use crate::{
|
||||||
@@ -61,7 +67,7 @@ struct GetObjectRequest {
|
|||||||
impl S3Bucket {
|
impl S3Bucket {
|
||||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||||
debug!(
|
tracing::debug!(
|
||||||
"Creating s3 remote storage for S3 bucket {}",
|
"Creating s3 remote storage for S3 bucket {}",
|
||||||
aws_config.bucket_name
|
aws_config.bucket_name
|
||||||
);
|
);
|
||||||
@@ -78,7 +84,6 @@ impl S3Bucket {
|
|||||||
// needed to access remote extensions bucket
|
// needed to access remote extensions bucket
|
||||||
.or_else("token", {
|
.or_else("token", {
|
||||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||||
|
|
||||||
WebIdentityTokenCredentialsProvider::builder()
|
WebIdentityTokenCredentialsProvider::builder()
|
||||||
.configure(&provider_conf)
|
.configure(&provider_conf)
|
||||||
.build()
|
.build()
|
||||||
@@ -98,18 +103,20 @@ impl S3Bucket {
|
|||||||
.set_max_attempts(Some(1))
|
.set_max_attempts(Some(1))
|
||||||
.set_mode(Some(RetryMode::Adaptive));
|
.set_mode(Some(RetryMode::Adaptive));
|
||||||
|
|
||||||
let mut config_builder = Config::builder()
|
let mut config_builder = Builder::default()
|
||||||
|
.behavior_version(BehaviorVersion::v2023_11_09())
|
||||||
.region(region)
|
.region(region)
|
||||||
.credentials_cache(CredentialsCache::lazy())
|
.identity_cache(IdentityCache::lazy().build())
|
||||||
.credentials_provider(credentials_provider)
|
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
|
||||||
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
|
.retry_config(retry_config.build())
|
||||||
.retry_config(retry_config.build());
|
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
|
||||||
|
|
||||||
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
||||||
config_builder = config_builder
|
config_builder = config_builder
|
||||||
.endpoint_url(custom_endpoint)
|
.endpoint_url(custom_endpoint)
|
||||||
.force_path_style(true);
|
.force_path_style(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
let client = Client::from_conf(config_builder.build());
|
let client = Client::from_conf(config_builder.build());
|
||||||
|
|
||||||
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
||||||
@@ -222,12 +229,15 @@ impl S3Bucket {
         match get_object {
             Ok(object_output) => {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);
 
+                let body = object_output.body;
+                let body = ByteStreamAsStream::from(body);
+                let body = PermitCarrying::new(permit, body);
+                let body = TimedDownload::new(started_at, body);
 
                 Ok(Download {
                     metadata,
-                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
-                        started_at,
-                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
-                    ))),
+                    download_stream: Box::pin(body),
                 })
             }
             Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -240,29 +250,55 @@ impl S3Bucket {
             }
         }
     }
 
+pin_project_lite::pin_project! {
+    struct ByteStreamAsStream {
+        #[pin]
+        inner: aws_smithy_types::byte_stream::ByteStream
+    }
+}
+
+impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
+    fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
+        ByteStreamAsStream { inner }
+    }
+}
+
+impl Stream for ByteStreamAsStream {
+    type Item = std::io::Result<Bytes>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        // this does the std::io::ErrorKind::Other conversion
+        self.project().inner.poll_next(cx).map_err(|x| x.into())
+    }
+
+    // cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes
+    // sense and Stream::size_hint does not really
+}
+
 pin_project_lite::pin_project! {
     /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    struct RatelimitedAsyncRead<S> {
+    struct PermitCarrying<S> {
         permit: tokio::sync::OwnedSemaphorePermit,
         #[pin]
         inner: S,
     }
 }
 
-impl<S: AsyncRead> RatelimitedAsyncRead<S> {
+impl<S> PermitCarrying<S> {
     fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        RatelimitedAsyncRead { permit, inner }
+        Self { permit, inner }
     }
 }
 
-impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
-        let this = self.project();
-        this.inner.poll_read(cx, buf)
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.project().inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
     }
 }
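The `PermitCarrying` wrapper above exists only to tie a semaphore permit's lifetime to the download stream; it forwards every `poll_next` untouched and the permit is released when the stream is dropped. A minimal, self-contained sketch of the same pattern, under the assumption that `pin-project-lite`, `bytes`, `futures` and `tokio` are available; the names and the 10-permit semaphore are illustrative, not taken from the real crate:

```rust
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};

use bytes::Bytes;
use futures::stream::{Stream, StreamExt};

pin_project_lite::pin_project! {
    /// Holds a semaphore permit for as long as the wrapped stream is alive.
    struct Limited<S> {
        permit: tokio::sync::OwnedSemaphorePermit,
        #[pin]
        inner: S,
    }
}

impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for Limited<S> {
    type Item = S::Item;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        // Pure delegation: the wrapper only exists to keep `permit` alive.
        self.project().inner.poll_next(cx)
    }
}

async fn demo() -> std::io::Result<()> {
    let semaphore = Arc::new(tokio::sync::Semaphore::new(10));
    let permit = semaphore.acquire_owned().await.expect("semaphore not closed");

    let item: std::io::Result<Bytes> = Ok(Bytes::from_static(b"hello"));
    let inner = futures::stream::once(futures::future::ready(item));

    let mut limited = Limited { permit, inner };
    while let Some(chunk) = limited.next().await {
        println!("{} bytes", chunk?.len());
    }
    Ok(()) // the permit is released here, when `limited` is dropped
}
```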
@@ -282,7 +318,7 @@ pin_project_lite::pin_project! {
     }
 }
 
-impl<S: AsyncRead> TimedDownload<S> {
+impl<S> TimedDownload<S> {
     fn new(started_at: std::time::Instant, inner: S) -> Self {
         TimedDownload {
             started_at,
@@ -292,25 +328,26 @@ impl<S: AsyncRead> TimedDownload<S> {
     }
 }
 
-impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        use std::task::ready;
+
         let this = self.project();
-        let before = buf.filled().len();
-        let read = std::task::ready!(this.inner.poll_read(cx, buf));
 
-        let read_eof = buf.filled().len() == before;
-
-        match read {
-            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
-            Ok(()) => { /* still in progress */ }
-            Err(_) => *this.outcome = AttemptOutcome::Err,
+        let res = ready!(this.inner.poll_next(cx));
+        match &res {
+            Some(Ok(_)) => {}
+            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
+            None => *this.outcome = metrics::AttemptOutcome::Ok,
         }
 
-        std::task::Poll::Ready(read)
+        Poll::Ready(res)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
     }
 }
@@ -371,11 +408,11 @@ impl RemoteStorage for S3Bucket {
 
         let response = response?;
 
-        let keys = response.contents().unwrap_or_default();
+        let keys = response.contents();
         let empty = Vec::new();
         let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
 
-        tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+        tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
 
         for object in keys {
             let object_path = object.key().expect("response does not contain a key");
@@ -400,7 +437,7 @@ impl RemoteStorage for S3Bucket {
 
     async fn upload(
         &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         from_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
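With `upload` now taking `impl Stream<Item = std::io::Result<Bytes>>` instead of an `AsyncRead`, callers that still hold a reader need a small adapter. A hedged sketch using `tokio_util::io::ReaderStream`, which yields exactly that item type; the file path and the commented-out call site are made up for illustration:

```rust
use bytes::Bytes;
use futures::stream::Stream;

/// Adapt any AsyncRead (e.g. a file) into the Stream<Item = io::Result<Bytes>>
/// shape that the new `upload` signature expects.
fn reader_to_stream<R>(
    reader: R,
) -> impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static
where
    R: tokio::io::AsyncRead + Send + Sync + 'static,
{
    tokio_util::io::ReaderStream::new(reader)
}

async fn upload_file_example() -> anyhow::Result<()> {
    let file = tokio::fs::File::open("some/layer/file").await?;
    let len = file.metadata().await?.len() as usize;
    let stream = reader_to_stream(file);
    // storage.upload(stream, len, &remote_path, None).await?;  // hypothetical call site
    let _ = (stream, len);
    Ok(())
}
```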
@@ -410,8 +447,8 @@ impl RemoteStorage for S3Bucket {
 
         let started_at = start_measuring_requests(kind);
 
-        let body = Body::wrap_stream(ReaderStream::new(from));
-        let bytes_stream = ByteStream::new(SdkBody::from(body));
+        let body = Body::wrap_stream(from);
+        let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
 
         let res = self
             .client
@@ -474,7 +511,7 @@ impl RemoteStorage for S3Bucket {
         for path in paths {
             let obj_id = ObjectIdentifier::builder()
                 .set_key(Some(self.relative_path_to_s3_object(path)))
-                .build();
+                .build()?;
             delete_objects.push(obj_id);
         }
 
@@ -485,7 +522,11 @@ impl RemoteStorage for S3Bucket {
             .client
             .delete_objects()
             .bucket(self.bucket_name.clone())
-            .delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
+            .delete(
+                Delete::builder()
+                    .set_objects(Some(chunk.to_vec()))
+                    .build()?,
+            )
             .send()
             .await;
 
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
+use bytes::Bytes;
+use futures::stream::Stream;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
@@ -108,7 +110,7 @@ impl RemoteStorage for UnreliableWrapper {
 
     async fn upload(
         &self,
-        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
         data_size_bytes: usize,
@@ -7,7 +7,9 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 
 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
     AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
@@ -180,23 +182,14 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu
     let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
 
     ctx.client.delete_objects(&[path1, path2]).await?;
 
@@ -219,53 +212,56 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
     let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let data = "remote blob data here".as_bytes();
-    let data_len = data.len() as u64;
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
 
-    ctx.client
-        .upload(std::io::Cursor::new(data), data.len(), &path, None)
-        .await?;
+    let (data, len) = wrap_stream(orig.clone());
 
-    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
+    ctx.client.upload(data, len, &path, None).await?;
+
+    async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
         let mut buf = Vec::new();
-        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
+        tokio::io::copy_buf(
+            &mut tokio_util::io::StreamReader::new(dl.download_stream),
+            &mut buf,
+        )
+        .await?;
         Ok(buf)
     }
     // Normal download request
     let dl = ctx.client.download(&path).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);
 
     // Full range (end specified)
     let dl = ctx
         .client
-        .download_byte_range(&path, 0, Some(data_len))
+        .download_byte_range(&path, 0, Some(len as u64))
         .await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);
 
     // partial range (end specified)
     let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..10]);
+    assert_eq!(&buf, &orig[4..10]);
 
     // partial range (end beyond real end)
     let dl = ctx
         .client
-        .download_byte_range(&path, 8, Some(data_len * 100))
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
         .await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[8..]);
+    assert_eq!(&buf, &orig[8..]);
 
     // Partial range (end unspecified)
     let dl = ctx.client.download_byte_range(&path, 4, None).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..]);
+    assert_eq!(&buf, &orig[4..]);
 
     // Full range (end unspecified)
     let dl = ctx.client.download_byte_range(&path, 0, None).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);
 
     debug!("Cleanup: deleting file at path {path:?}");
     ctx.client
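The reworked `download_and_compare` goes the opposite direction from the upload path: `tokio_util::io::StreamReader` turns the `Stream<Item = io::Result<Bytes>>` download body back into a buffered reader so `tokio::io::copy_buf` can collect it. A standalone sketch of that conversion, with a fabricated two-chunk stream standing in for a real `Download`:

```rust
use bytes::Bytes;
use futures::stream::Stream;

/// Collect a byte stream (like `Download::download_stream`) into a Vec<u8>.
async fn collect_stream<S>(stream: S) -> std::io::Result<Vec<u8>>
where
    S: Stream<Item = std::io::Result<Bytes>> + Unpin,
{
    // StreamReader turns the stream back into an AsyncBufRead, so copy_buf
    // can drain it without an intermediate per-chunk copy.
    let mut reader = tokio_util::io::StreamReader::new(stream);
    let mut buf = Vec::new();
    tokio::io::copy_buf(&mut reader, &mut buf).await?;
    Ok(buf)
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let chunks: Vec<std::io::Result<Bytes>> = vec![
        Ok(Bytes::from_static(b"remote blob ")),
        Ok(Bytes::from_static(b"data here")),
    ];
    let stream = futures::stream::iter(chunks);
    let buf = collect_stream(stream).await?;
    assert_eq!(buf.as_slice(), b"remote blob data here".as_slice());
    Ok(())
}
```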
@@ -281,6 +277,7 @@ fn ensure_logging_ready() {
         utils::logging::init(
             utils::logging::LogFormat::Test,
             utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
         )
         .expect("logging init failed");
     });
@@ -503,11 +500,8 @@ async fn upload_azure_data(
             let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
@@ -588,11 +582,8 @@ async fn upload_simple_azure_data(
             .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>(blob_path)
         });
@@ -621,3 +612,32 @@ async fn upload_simple_azure_data(
     ControlFlow::Continue(uploaded_blobs)
     }
 }
+
+// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled
+// to binary
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
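For reference, the `wrap_stream` helper added above produces a one-shot stream: the whole payload arrives as a single `Bytes` chunk, and the length is returned alongside it because the S3 PUT path needs the content length up front. A small illustrative check of that behaviour (not part of the test suite):

```rust
use bytes::Bytes;
use futures::StreamExt;

#[tokio::main]
async fn main() {
    let content = Bytes::from_static(b"remote blob data1");
    let len = content.len();
    let item: std::io::Result<Bytes> = Ok(content);
    let mut stream = futures::stream::once(futures::future::ready(item));

    // Exactly one chunk comes out, carrying the full payload.
    let first = stream.next().await.unwrap().unwrap();
    assert_eq!(first.len(), len);
    assert!(stream.next().await.is_none());
}
```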
@@ -7,7 +7,9 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 
 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -176,23 +178,14 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
     let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
 
     ctx.client.delete_objects(&[path1, path2]).await?;
 
@@ -210,6 +203,7 @@ fn ensure_logging_ready() {
         utils::logging::init(
             utils::logging::LogFormat::Test,
             utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
         )
         .expect("logging init failed");
     });
@@ -431,11 +425,9 @@ async fn upload_s3_data(
             let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
@@ -516,11 +508,9 @@ async fn upload_simple_s3_data(
             .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>(blob_path)
         });
@@ -549,3 +539,30 @@ async fn upload_simple_s3_data(
     ControlFlow::Continue(uploaded_blobs)
     }
 }
+
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
libs/utils/scripts/restore_from_wal_initdb.sh (new executable file, 21 lines)
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# like restore_from_wal.sh, but takes existing initdb.tar.zst
+
+set -euxo pipefail
+
+PG_BIN=$1
+WAL_PATH=$2
+DATA_DIR=$3
+PORT=$4
+echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
+echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
+REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
+declare -i WAL_SIZE=$REDO_POS+114
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
+for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
+dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+rm -f 000000010000000000000001
@@ -1,16 +1,14 @@
|
|||||||
use std::sync::Arc;
|
use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
|
||||||
|
|
||||||
use tokio::sync::{mpsc, Mutex};
|
|
||||||
|
|
||||||
/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
|
/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
|
||||||
///
|
///
|
||||||
/// Can be cloned, moved and kept around in futures as "guard objects".
|
/// Can be cloned, moved and kept around in futures as "guard objects".
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Completion(mpsc::Sender<()>);
|
pub struct Completion(TaskTrackerToken);
|
||||||
|
|
||||||
/// Barrier will wait until all clones of [`Completion`] have been dropped.
|
/// Barrier will wait until all clones of [`Completion`] have been dropped.
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
|
pub struct Barrier(TaskTracker);
|
||||||
|
|
||||||
impl Default for Barrier {
|
impl Default for Barrier {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
@@ -21,7 +19,7 @@ impl Default for Barrier {
|
|||||||
|
|
||||||
impl Barrier {
|
impl Barrier {
|
||||||
pub async fn wait(self) {
|
pub async fn wait(self) {
|
||||||
self.0.lock().await.recv().await;
|
self.0.wait().await;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn maybe_wait(barrier: Option<Barrier>) {
|
pub async fn maybe_wait(barrier: Option<Barrier>) {
|
||||||
@@ -33,8 +31,7 @@ impl Barrier {
|
|||||||
|
|
||||||
impl PartialEq for Barrier {
|
impl PartialEq for Barrier {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
// we don't use dyn so this is good
|
TaskTracker::ptr_eq(&self.0, &other.0)
|
||||||
Arc::ptr_eq(&self.0, &other.0)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -42,8 +39,10 @@ impl Eq for Barrier {}
|
|||||||
|
|
||||||
/// Create new Guard and Barrier pair.
|
/// Create new Guard and Barrier pair.
|
||||||
pub fn channel() -> (Completion, Barrier) {
|
pub fn channel() -> (Completion, Barrier) {
|
||||||
let (tx, rx) = mpsc::channel::<()>(1);
|
let tracker = TaskTracker::new();
|
||||||
let rx = Mutex::new(rx);
|
// otherwise wait never exits
|
||||||
let rx = Arc::new(rx);
|
tracker.close();
|
||||||
(Completion(tx), Barrier(rx))
|
|
||||||
|
let token = tracker.token();
|
||||||
|
(Completion(token), Barrier(tracker))
|
||||||
}
|
}
|
||||||
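The rewritten `completion` module leans entirely on `tokio_util::task::TaskTracker`: a `Completion` is a `TaskTrackerToken`, a `Barrier` is the tracker itself, and `close()` is required up front or `wait()` would never return. A minimal sketch of that behaviour using the tokio-util types directly (names and the spawned waiter are illustrative):

```rust
use tokio_util::task::TaskTracker;

#[tokio::main]
async fn main() {
    let tracker = TaskTracker::new();
    // Without close(), wait() never returns even after every token is gone.
    tracker.close();

    let completion = tracker.token();
    let barrier = tracker.clone();

    let waiter = tokio::spawn(async move { barrier.wait().await });

    // The waiter cannot complete while a token is still alive.
    assert!(!waiter.is_finished());

    drop(completion); // dropping the last token releases all waiters
    waiter.await.unwrap();
}
```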
@@ -152,3 +152,16 @@ impl Debug for Generation {
         }
     }
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn generation_gt() {
+        // Important that a None generation compares less than a valid one, during upgrades from
+        // pre-generation systems.
+        assert!(Generation::none() < Generation::new(0));
+        assert!(Generation::none() < Generation::new(1));
+    }
+}
@@ -66,9 +66,17 @@ pub enum TracingErrorLayerEnablement {
     EnableWithRustLogFilter,
 }
 
+/// Where the logging should output to.
+#[derive(Clone, Copy)]
+pub enum Output {
+    Stdout,
+    Stderr,
+}
+
 pub fn init(
     log_format: LogFormat,
     tracing_error_layer_enablement: TracingErrorLayerEnablement,
+    output: Output,
 ) -> anyhow::Result<()> {
     // We fall back to printing all spans at info-level or above if
     // the RUST_LOG environment variable is not set.
@@ -85,7 +93,12 @@ pub fn init(
     let log_layer = tracing_subscriber::fmt::layer()
         .with_target(false)
         .with_ansi(false)
-        .with_writer(std::io::stdout);
+        .with_writer(move || -> Box<dyn std::io::Write> {
+            match output {
+                Output::Stdout => Box::new(std::io::stdout()),
+                Output::Stderr => Box::new(std::io::stderr()),
+            }
+        });
     let log_layer = match log_format {
         LogFormat::Json => log_layer.json().boxed(),
         LogFormat::Plain => log_layer.boxed(),
@@ -1,10 +1,10 @@
 //!
 //! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
 //! similar to a lock, but it allows readers to "hold on" to an old value of RCU
-//! without blocking writers, and allows writing a new values without blocking
-//! readers. When you update the new value, the new value is immediately visible
+//! without blocking writers, and allows writing a new value without blocking
+//! readers. When you update the value, the new value is immediately visible
 //! to new readers, but the update waits until all existing readers have
-//! finishe, so that no one sees the old value anymore.
+//! finished, so that on return, no one sees the old value anymore.
 //!
 //! This implementation isn't wait-free; it uses an RwLock that is held for a
 //! short duration when the value is read or updated.
@@ -26,6 +26,7 @@
 //! Increment the value by one, and wait for old readers to finish:
 //!
 //! ```
+//! # async fn dox() {
 //! # let rcu = utils::simple_rcu::Rcu::new(1);
 //! let write_guard = rcu.lock_for_write();
 //!
@@ -36,15 +37,17 @@
 //!
 //! // Concurrent reads and writes are now possible again. Wait for all the readers
 //! // that still observe the old value to finish.
-//! waitlist.wait();
+//! waitlist.wait().await;
+//! # }
 //! ```
 //!
 #![warn(missing_docs)]
 
 use std::ops::Deref;
-use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
 use std::sync::{Arc, Weak};
-use std::sync::{Mutex, RwLock, RwLockWriteGuard};
+use std::sync::{RwLock, RwLockWriteGuard};
+
+use tokio::sync::watch;
 
 ///
 /// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -68,22 +71,21 @@ struct RcuCell<V> {
     value: V,
 
     /// A dummy channel. We never send anything to this channel. The point is
-    /// that when the RcuCell is dropped, any cloned Senders will be notified
+    /// that when the RcuCell is dropped, any subscribed Receivers will be notified
     /// that the channel is closed. Updaters can use this to wait out until the
     /// RcuCell has been dropped, i.e. until the old value is no longer in use.
     ///
-    /// We never do anything with the receiver, we just need to hold onto it so
-    /// that the Senders will be notified when it's dropped. But because it's
-    /// not Sync, we need a Mutex on it.
-    watch: (SyncSender<()>, Mutex<Receiver<()>>),
+    /// We never send anything to this, we just need to hold onto it so that the
+    /// Receivers will be notified when it's dropped.
+    watch: watch::Sender<()>,
 }
 
 impl<V> RcuCell<V> {
     fn new(value: V) -> Self {
-        let (watch_sender, watch_receiver) = sync_channel(0);
+        let (watch_sender, _) = watch::channel(());
         RcuCell {
             value,
-            watch: (watch_sender, Mutex::new(watch_receiver)),
+            watch: watch_sender,
         }
     }
 }
@@ -141,10 +143,10 @@ impl<V> Deref for RcuReadGuard<V> {
 ///
 /// Write guard returned by `write`
 ///
-/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
-/// it should only be held for a short duration!
+/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
+/// held for a short duration!
 ///
-/// Calling `store` consumes the guard, making new reads and new writes possible
+/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
 /// again.
 ///
 pub struct RcuWriteGuard<'a, V> {
@@ -179,7 +181,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
         // the watches for any that do.
         self.inner.old_cells.retain(|weak| {
             if let Some(cell) = weak.upgrade() {
-                watches.push(cell.watch.0.clone());
+                watches.push(cell.watch.subscribe());
                 true
             } else {
                 false
@@ -193,20 +195,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
 ///
 /// List of readers who can still see old values.
 ///
-pub struct RcuWaitList(Vec<SyncSender<()>>);
+pub struct RcuWaitList(Vec<watch::Receiver<()>>);
 
 impl RcuWaitList {
     ///
     /// Wait for old readers to finish.
     ///
-    pub fn wait(mut self) {
+    pub async fn wait(mut self) {
         // after all the old_cells are no longer in use, we're done
         for w in self.0.iter_mut() {
             // This will block until the Receiver is closed. That happens when
             // the RcuCell is dropped.
             #[allow(clippy::single_match)]
-            match w.send(()) {
-                Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
+            match w.changed().await {
+                Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
                 Err(_) => {
                     // closed, which means that the cell has been dropped, and
                     // its value is no longer in use
@@ -220,11 +222,10 @@ impl RcuWaitList {
 mod tests {
     use super::*;
     use std::sync::{Arc, Mutex};
-    use std::thread::{sleep, spawn};
     use std::time::Duration;
 
-    #[test]
-    fn two_writers() {
+    #[tokio::test]
+    async fn two_writers() {
         let rcu = Rcu::new(1);
 
         let read1 = rcu.read();
@@ -248,33 +249,35 @@ mod tests {
         assert_eq!(*read1, 1);
 
         let log = Arc::new(Mutex::new(Vec::new()));
-        // Wait for the old readers to finish in separate threads.
+        // Wait for the old readers to finish in separate tasks.
         let log_clone = Arc::clone(&log);
-        let thread2 = spawn(move || {
-            wait2.wait();
+        let task2 = tokio::spawn(async move {
+            wait2.wait().await;
             log_clone.lock().unwrap().push("wait2 done");
         });
         let log_clone = Arc::clone(&log);
-        let thread3 = spawn(move || {
-            wait3.wait();
+        let task3 = tokio::spawn(async move {
+            wait3.wait().await;
             log_clone.lock().unwrap().push("wait3 done");
         });
 
         // without this sleep the test can pass on accident if the writer is slow
-        sleep(Duration::from_millis(500));
+        tokio::time::sleep(Duration::from_millis(100)).await;
 
         // Release first reader. This allows first write to finish, but calling
-        // wait() on the second one would still block.
+        // wait() on the 'task3' would still block.
         log.lock().unwrap().push("dropping read1");
         drop(read1);
-        thread2.join().unwrap();
+        task2.await.unwrap();
 
-        sleep(Duration::from_millis(500));
+        assert!(!task3.is_finished());
+
+        tokio::time::sleep(Duration::from_millis(100)).await;
 
         // Release second reader, and finish second writer.
         log.lock().unwrap().push("dropping read2");
         drop(read2);
-        thread3.join().unwrap();
+        task3.await.unwrap();
 
         assert_eq!(
             log.lock().unwrap().as_slice(),
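The port from `std::sync::mpsc` to `tokio::sync::watch` keeps the same trick: nothing is ever sent on the channel, and the only signal is the sender being dropped, which makes `changed()` return `Err` on every subscribed receiver. A minimal sketch of just that mechanism:

```rust
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // The sender plays the role of the RcuCell's `watch` field: held, never used.
    let (sender, _) = watch::channel(());
    let mut receiver = sender.subscribe();

    let waiter = tokio::spawn(async move {
        // Err(..) means the Sender was dropped, i.e. the "cell" is gone.
        assert!(receiver.changed().await.is_err());
    });

    drop(sender); // dropping the sender is the only notification ever delivered
    waiter.await.unwrap();
}
```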
@@ -30,18 +30,32 @@ async fn warn_if_stuck<Fut: std::future::Future>(
 
     let mut fut = std::pin::pin!(fut);
 
-    loop {
+    let mut warned = false;
+    let ret = loop {
         match tokio::time::timeout(warn_period, &mut fut).await {
-            Ok(ret) => return ret,
+            Ok(ret) => break ret,
             Err(_) => {
                 tracing::warn!(
                     gate = name,
                     elapsed_ms = started.elapsed().as_millis(),
                     "still waiting, taking longer than expected..."
                 );
+                warned = true;
             }
         }
+    };
+
+    // If we emitted a warning for slowness, also emit a message when we complete, so that
+    // someone debugging a shutdown can know for sure whether we have moved past this operation.
+    if warned {
+        tracing::info!(
+            gate = name,
+            elapsed_ms = started.elapsed().as_millis(),
+            "completed, after taking longer than expected"
+        )
     }
 
+    ret
 }
 
 #[derive(Debug)]
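The reworked `warn_if_stuck` polls the future through `tokio::time::timeout` in a loop, warning on each expiry, and logs once more on completion if it warned at all. A self-contained restatement of the pattern as a runnable function; the function and field names here are illustrative rather than the gate module's own:

```rust
use std::time::Duration;

async fn warn_if_slow<F: std::future::Future>(name: &str, warn_period: Duration, fut: F) -> F::Output {
    let started = std::time::Instant::now();
    let mut fut = std::pin::pin!(fut);
    let mut warned = false;
    let ret = loop {
        // Each timeout expiry produces one warning; a completed poll breaks with the value.
        match tokio::time::timeout(warn_period, &mut fut).await {
            Ok(ret) => break ret,
            Err(_) => {
                tracing::warn!(task = name, elapsed_ms = started.elapsed().as_millis(), "still waiting");
                warned = true;
            }
        }
    };
    if warned {
        // A closing message so whoever is debugging knows the operation did finish.
        tracing::info!(task = name, elapsed_ms = started.elapsed().as_millis(), "completed");
    }
    ret
}

#[tokio::main]
async fn main() {
    warn_if_slow("demo", Duration::from_millis(100), tokio::time::sleep(Duration::from_millis(350))).await;
}
```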
@@ -51,6 +51,7 @@ regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
+serde_path_to_error.workspace = true
 serde_with.workspace = true
 signal-hook.workspace = true
 smallvec = { workspace = true, features = ["write"] }
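`serde_path_to_error` is presumably pulled in for better deserialization diagnostics: it wraps a `Deserializer` and reports the path to the field that failed, not just the error. A hedged sketch of the crate's typical use with serde_json; the struct and the JSON document are made up for illustration:

```rust
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct IndexLike {
    disk_consistent_lsn: String,
    layer_count: u64,
}

fn main() {
    let bad = r#"{"disk_consistent_lsn": "0/16960E8", "layer_count": "not a number"}"#;
    let mut de = serde_json::Deserializer::from_str(bad);
    let result: Result<IndexLike, _> = serde_path_to_error::deserialize(&mut de);
    match result {
        Ok(v) => println!("{v:?}"),
        // The error's Display output is prefixed with the offending path,
        // e.g. `layer_count: invalid type: string "not a number", expected u64`.
        Err(err) => println!("{err}"),
    }
}
```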
@@ -3,6 +3,7 @@ use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
+use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
@@ -211,7 +212,7 @@ fn bench_sequential(c: &mut Criterion) {
         let i32 = (i as u32) % 100;
         let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
         let layer = PersistentLayerDesc::new_img(
-            TenantId::generate(),
+            TenantShardId::unsharded(TenantId::generate()),
             TimelineId::generate(),
             zero.add(10 * i32)..zero.add(10 * i32 + 1),
             Lsn(i),
@@ -18,3 +18,5 @@ tokio.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
+serde.workspace = true
+serde_json.workspace = true
pageserver/ctl/src/index_part.rs (new file, 38 lines)
@@ -0,0 +1,38 @@
+use std::collections::HashMap;
+
+use anyhow::Context;
+use camino::Utf8PathBuf;
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
+use utils::lsn::Lsn;
+
+#[derive(clap::Subcommand)]
+pub(crate) enum IndexPartCmd {
+    Dump { path: Utf8PathBuf },
+}
+
+pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
+    match cmd {
+        IndexPartCmd::Dump { path } => {
+            let bytes = tokio::fs::read(path).await.context("read file")?;
+            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
+            #[derive(serde::Serialize)]
+            struct Output<'a> {
+                layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
+                disk_consistent_lsn: Lsn,
+                timeline_metadata: &'a TimelineMetadata,
+            }
+
+            let output = Output {
+                layer_metadata: &des.layer_metadata,
+                disk_consistent_lsn: des.get_disk_consistent_lsn(),
+                timeline_metadata: &des.metadata,
+            };
+
+            let output = serde_json::to_string_pretty(&output).context("serialize output")?;
+            println!("{output}");
+            Ok(())
+        }
+    }
+}
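The `Dump` command's `Output<'a>` struct shows a handy pattern: deserialize the full `IndexPart`, then serialize only the interesting fields through a small struct that borrows from it. A standalone sketch of the same pattern with stand-in types (these are not the real pageserver types; assumes serde, serde_json and anyhow):

```rust
use std::collections::HashMap;

use serde::Serialize;

#[derive(Serialize)]
struct Full {
    layers: HashMap<String, u64>,
    lsn: u64,
    internal_only: Vec<u8>,
}

// Borrows the fields worth showing; everything else stays out of the dump.
#[derive(Serialize)]
struct Output<'a> {
    layers: &'a HashMap<String, u64>,
    lsn: u64,
}

fn main() -> anyhow::Result<()> {
    let full = Full {
        layers: HashMap::from([("000000..-delta".to_string(), 8192)]),
        lsn: 0x1696_0E8,
        internal_only: vec![1, 2, 3],
    };
    let output = Output { layers: &full.layers, lsn: full.lsn };
    println!("{}", serde_json::to_string_pretty(&output)?);
    Ok(())
}
```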
@@ -1,13 +1,15 @@
 use std::path::{Path, PathBuf};
 
 use anyhow::Result;
-use camino::Utf8Path;
+use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
 use pageserver::tenant::disk_btree::DiskBtreeReader;
 use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
+use pageserver::tenant::storage_layer::{delta_layer, image_layer};
+use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use pageserver::{page_cache, virtual_file};
 use pageserver::{
@@ -20,6 +22,7 @@ use pageserver::{
 };
 use std::fs;
 use utils::bin_ser::BeSer;
+use utils::id::{TenantId, TimelineId};
 
 use crate::layer_map_analyzer::parse_filename;
 
@@ -45,6 +48,13 @@ pub(crate) enum LayerCmd {
         /// The id from list-layer command
         id: usize,
     },
+    RewriteSummary {
+        layer_file_path: Utf8PathBuf,
+        #[clap(long)]
+        new_tenant_id: Option<TenantId>,
+        #[clap(long)]
+        new_timeline_id: Option<TimelineId>,
+    },
 }
 
 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
@@ -100,6 +110,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                     println!("- timeline {}", timeline.file_name().to_string_lossy());
                 }
             }
+            Ok(())
         }
         LayerCmd::ListLayer {
             path,
@@ -128,6 +139,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                 idx += 1;
             }
         }
+            Ok(())
         }
         LayerCmd::DumpLayer {
             path,
@@ -168,7 +180,63 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                 idx += 1;
             }
         }
+            Ok(())
+        }
+        LayerCmd::RewriteSummary {
+            layer_file_path,
+            new_tenant_id,
+            new_timeline_id,
+        } => {
+            pageserver::virtual_file::init(10);
+            pageserver::page_cache::init(100);
+
+            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+
+            macro_rules! rewrite_closure {
+                ($($summary_ty:tt)*) => {{
+                    |summary| $($summary_ty)* {
+                        tenant_id: new_tenant_id.unwrap_or(summary.tenant_id),
+                        timeline_id: new_timeline_id.unwrap_or(summary.timeline_id),
+                        ..summary
+                    }
+                }};
+            }
+
+            let res = ImageLayer::rewrite_summary(
+                layer_file_path,
+                rewrite_closure!(image_layer::Summary),
+                &ctx,
+            )
+            .await;
+            match res {
+                Ok(()) => {
+                    println!("Successfully rewrote summary of image layer {layer_file_path}");
+                    return Ok(());
+                }
+                Err(image_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
+                Err(image_layer::RewriteSummaryError::Other(e)) => {
+                    return Err(e);
+                }
+            }
+
+            let res = DeltaLayer::rewrite_summary(
+                layer_file_path,
+                rewrite_closure!(delta_layer::Summary),
+                &ctx,
+            )
+            .await;
+            match res {
+                Ok(()) => {
+                    println!("Successfully rewrote summary of delta layer {layer_file_path}");
+                    return Ok(());
+                }
+                Err(delta_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
+                Err(delta_layer::RewriteSummaryError::Other(e)) => {
+                    return Err(e);
+                }
+            }
+
+            anyhow::bail!("not an image or delta layer: {layer_file_path}");
         }
     }
-    Ok(())
 }
@@ -5,11 +5,13 @@
 //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
 
 mod draw_timeline_dir;
+mod index_part;
 mod layer_map_analyzer;
 mod layers;
 
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
+use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use pageserver::{
     context::{DownloadBehavior, RequestContext},
@@ -38,6 +40,8 @@ struct CliOpts {
 #[derive(Subcommand)]
 enum Commands {
     Metadata(MetadataCmd),
+    #[command(subcommand)]
+    IndexPart(IndexPartCmd),
     PrintLayerFile(PrintLayerFileCmd),
     DrawTimeline {},
     AnalyzeLayerMap(AnalyzeLayerMapCmd),
@@ -83,6 +87,9 @@ async fn main() -> anyhow::Result<()> {
         Commands::Metadata(cmd) => {
             handle_metadata(&cmd)?;
         }
+        Commands::IndexPart(cmd) => {
+            index_part::main(&cmd).await?;
+        }
         Commands::DrawTimeline {} => {
             draw_timeline_dir::main()?;
         }
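Wiring `IndexPartCmd` into `pagectl` relies on clap's nested subcommands: the variant is annotated with `#[command(subcommand)]` and carries its own `Subcommand` enum. A minimal sketch of that shape with illustrative names (not the real pagectl CLI):

```rust
use clap::{Parser, Subcommand};

#[derive(Parser)]
struct Cli {
    #[command(subcommand)]
    command: Commands,
}

#[derive(Subcommand)]
enum Commands {
    /// Operations on an index_part.json file
    #[command(subcommand)]
    IndexPart(IndexPartCmd),
}

#[derive(Subcommand)]
enum IndexPartCmd {
    Dump { path: std::path::PathBuf },
}

fn main() {
    // Invoked as e.g. `tool index-part dump <PATH>`; clap derives the parser from the enums.
    let cli = Cli::parse();
    match cli.command {
        Commands::IndexPart(IndexPartCmd::Dump { path }) => {
            println!("would dump {}", path.display())
        }
    }
}
```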
@@ -103,7 +103,11 @@ fn main() -> anyhow::Result<()> {
     } else {
         TracingErrorLayerEnablement::Disabled
     };
-    logging::init(conf.log_format, tracing_error_layer_enablement)?;
+    logging::init(
+        conf.log_format,
+        tracing_error_layer_enablement,
+        logging::Output::Stdout,
+    )?;
 
     // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
     // disarming this hook on pageserver, because we never tear down tracing.
@@ -366,13 +370,18 @@ fn start_pageserver(
     // Top-level cancellation token for the process
     let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
 
+    pageserver::PAGESERVER_SHUTDOWN_TOKEN
+        .set(shutdown_pageserver.clone())
+        .map_err(|_| ())
+        .expect("cannot be set already");
+
     // Set up remote storage client
     let remote_storage = create_remote_storage_client(conf)?;
 
     // Set up deletion queue
     let (deletion_queue, deletion_workers) = DeletionQueue::new(
         remote_storage.clone(),
-        ControlPlaneClient::new(conf, &shutdown_pageserver),
+        ControlPlaneClient::new(conf, shutdown_pageserver.child_token()),
         conf,
     );
     if let Some(deletion_workers) = deletion_workers {
@@ -398,15 +407,11 @@ fn start_pageserver(
     let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
     let (init_done_tx, init_done_rx) = utils::completion::channel();
 
-    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
-
     let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
 
     let order = pageserver::InitializationOrder {
         initial_tenant_load_remote: Some(init_done_tx),
         initial_tenant_load: Some(init_remote_done_tx),
-        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: Some(init_logical_size_done_tx),
         background_jobs_can_start: background_jobs_barrier.clone(),
     };
 
@@ -420,13 +425,12 @@ fn start_pageserver(
             deletion_queue_client,
         },
         order,
-        shutdown_pageserver.clone(),
+        shutdown_pageserver.child_token(),
     ))?;
     let tenant_manager = Arc::new(tenant_manager);
 
     BACKGROUND_RUNTIME.spawn({
-        let init_done_rx = init_done_rx;
-        let shutdown_pageserver = shutdown_pageserver.clone();
+        let shutdown_pageserver = shutdown_pageserver.child_token();
         let drive_init = async move {
             // NOTE: unlike many futures in pageserver, this one is cancellation-safe
             let guard = scopeguard::guard_on_success((), |_| {
@@ -460,7 +464,7 @@ fn start_pageserver(
             });
 
             let WaitForPhaseResult {
-                timeout_remaining: timeout,
+                timeout_remaining: _timeout,
                 skipped: init_load_skipped,
             } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
 
@@ -468,26 +472,6 @@ fn start_pageserver(
 
             scopeguard::ScopeGuard::into_inner(guard);
 
-            let guard = scopeguard::guard_on_success((), |_| {
-                tracing::info!("Cancelled before initial logical sizes completed")
-            });
-
-            let logical_sizes_done = std::pin::pin!(async {
-                init_logical_size_done_rx.wait().await;
-                startup_checkpoint(
-                    started_startup_at,
-                    "initial_logical_sizes",
-                    "Initial logical sizes completed",
-                );
-            });
-
-            let WaitForPhaseResult {
-                timeout_remaining: _,
-                skipped: logical_sizes_skipped,
-            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
-
-            scopeguard::ScopeGuard::into_inner(guard);
-
             // allow background jobs to start: we either completed prior stages, or they reached timeout
             // and were skipped. It is important that we do not let them block background jobs indefinitely,
             // because things like consumption metrics for billing are blocked by this barrier.
@@ -510,9 +494,6 @@ fn start_pageserver(
|
|||||||
if let Some(f) = init_load_skipped {
|
if let Some(f) = init_load_skipped {
|
||||||
f.await;
|
f.await;
|
||||||
}
|
}
|
||||||
if let Some(f) = logical_sizes_skipped {
|
|
||||||
f.await;
|
|
||||||
}
|
|
||||||
scopeguard::ScopeGuard::into_inner(guard);
|
scopeguard::ScopeGuard::into_inner(guard);
|
||||||
|
|
||||||
startup_checkpoint(started_startup_at, "complete", "Startup complete");
|
startup_checkpoint(started_startup_at, "complete", "Startup complete");
|
||||||
@@ -540,6 +521,7 @@ fn start_pageserver(
|
|||||||
remote_storage.clone(),
|
remote_storage.clone(),
|
||||||
disk_usage_eviction_state.clone(),
|
disk_usage_eviction_state.clone(),
|
||||||
background_jobs_barrier.clone(),
|
background_jobs_barrier.clone(),
|
||||||
|
shutdown_pageserver.child_token(),
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -560,13 +542,16 @@ fn start_pageserver(
|
|||||||
)
|
)
|
||||||
.context("Failed to initialize router state")?,
|
.context("Failed to initialize router state")?,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let cancel = shutdown_pageserver.child_token();
|
||||||
|
|
||||||
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
|
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
|
||||||
.build()
|
.build()
|
||||||
.map_err(|err| anyhow!(err))?;
|
.map_err(|err| anyhow!(err))?;
|
||||||
let service = utils::http::RouterService::new(router).unwrap();
|
let service = utils::http::RouterService::new(router).unwrap();
|
||||||
let server = hyper::Server::from_tcp(http_listener)?
|
let server = hyper::Server::from_tcp(http_listener)?
|
||||||
.serve(service)
|
.serve(service)
|
||||||
.with_graceful_shutdown(task_mgr::shutdown_watcher());
|
.with_graceful_shutdown(cancel.clone().cancelled_owned());
|
||||||
|
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
MGMT_REQUEST_RUNTIME.handle(),
|
MGMT_REQUEST_RUNTIME.handle(),
|
||||||
@@ -575,6 +560,7 @@ fn start_pageserver(
|
|||||||
None,
|
None,
|
||||||
"http endpoint listener",
|
"http endpoint listener",
|
||||||
true,
|
true,
|
||||||
|
cancel,
|
||||||
async {
|
async {
|
||||||
server.await?;
|
server.await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -583,7 +569,6 @@ fn start_pageserver(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||||
let background_jobs_barrier = background_jobs_barrier;
|
|
||||||
let metrics_ctx = RequestContext::todo_child(
|
let metrics_ctx = RequestContext::todo_child(
|
||||||
TaskKind::MetricsCollection,
|
TaskKind::MetricsCollection,
|
||||||
// This task itself shouldn't download anything.
|
// This task itself shouldn't download anything.
|
||||||
@@ -601,6 +586,7 @@ fn start_pageserver(
|
|||||||
None,
|
None,
|
||||||
"consumption metrics collection",
|
"consumption metrics collection",
|
||||||
true,
|
true,
|
||||||
|
shutdown_pageserver.child_token(),
|
||||||
async move {
|
async move {
|
||||||
// first wait until background jobs are cleared to launch.
|
// first wait until background jobs are cleared to launch.
|
||||||
//
|
//
|
||||||
@@ -621,6 +607,7 @@ fn start_pageserver(
|
|||||||
conf.synthetic_size_calculation_interval,
|
conf.synthetic_size_calculation_interval,
|
||||||
conf.id,
|
conf.id,
|
||||||
local_disk_storage,
|
local_disk_storage,
|
||||||
|
cancel,
|
||||||
metrics_ctx,
|
metrics_ctx,
|
||||||
)
|
)
|
||||||
.instrument(info_span!("metrics_collection"))
|
.instrument(info_span!("metrics_collection"))
|
||||||
@@ -648,6 +635,7 @@ fn start_pageserver(
|
|||||||
None,
|
None,
|
||||||
"libpq endpoint listener",
|
"libpq endpoint listener",
|
||||||
true,
|
true,
|
||||||
|
shutdown_pageserver.child_token(),
|
||||||
async move {
|
async move {
|
||||||
page_service::libpq_listener_main(
|
page_service::libpq_listener_main(
|
||||||
conf,
|
conf,
|
||||||
@@ -681,9 +669,8 @@ fn start_pageserver(
|
|||||||
signal.name()
|
signal.name()
|
||||||
);
|
);
|
||||||
|
|
||||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
// This cancels the `shutdown_pageserver` cancellation tree and signals cancellation to
|
||||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
// all tasks in the system.
|
||||||
// The plan is to change that over time.
|
|
||||||
shutdown_pageserver.take();
|
shutdown_pageserver.take();
|
||||||
let bg_remote_storage = remote_storage.clone();
|
let bg_remote_storage = remote_storage.clone();
|
||||||
let bg_deletion_queue = deletion_queue.clone();
|
let bg_deletion_queue = deletion_queue.clone();
|
||||||
|
|||||||
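The pageserver.rs hunks above move shutdown signalling onto a `CancellationToken` tree: the root `shutdown_pageserver` token hands a `child_token()` to the deletion queue, the HTTP listener, metrics collection, and the libpq listener, and the HTTP server's graceful shutdown now waits on `cancelled_owned()`. The sketch below is a stand-alone illustration of that tokio-util pattern, not the pageserver wiring; the task names and intervals are made up.

// Illustrative sketch only -- not the pageserver code. One root token,
// child tokens for each task, one cancel() to shut everything down.
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    let shutdown = CancellationToken::new();

    // Each task gets its own child token; cancelling the root cancels them all.
    let worker_cancel = shutdown.child_token();
    let worker = tokio::spawn(async move {
        loop {
            tokio::select! {
                _ = worker_cancel.cancelled() => break,
                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
                    // periodic work would go here
                }
            }
        }
    });

    // An HTTP server can consume an owned `cancelled` future for graceful shutdown,
    // mirroring `.with_graceful_shutdown(cancel.clone().cancelled_owned())` above.
    let http_cancel = shutdown.child_token();
    let _graceful = http_cancel.cancelled_owned();

    // On SIGTERM/SIGINT the whole tree is cancelled at once.
    shutdown.cancel();
    worker.await.unwrap();
}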
@@ -5,6 +5,7 @@
//! See also `settings.md` for better description on every parameter.

use anyhow::{anyhow, bail, ensure, Context, Result};
+use pageserver_api::shard::TenantShardId;
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde::de::IntoDeserializer;
use std::env;
@@ -25,7 +26,7 @@ use toml_edit::{Document, Item};
use camino::{Utf8Path, Utf8PathBuf};
use postgres_backend::AuthType;
use utils::{
-id::{NodeId, TenantId, TimelineId},
+id::{NodeId, TimelineId},
logging::LogFormat,
};

@@ -628,12 +629,13 @@ impl PageServerConf {
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
}

-pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenants_path().join(tenant_id.to_string())
+pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenants_path().join(tenant_shard_id.to_string())
}

-pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
+pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenant_path(tenant_shard_id)
+.join(IGNORED_TENANT_FILE_NAME)
}

/// Points to a place in pageserver's local directory,
@@ -641,47 +643,53 @@ impl PageServerConf {
///
/// Legacy: superseded by tenant_location_config_path. Eventually
/// remove this function.
-pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
+pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
}

-pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenant_path(tenant_id)
+pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenant_path(tenant_shard_id)
.join(TENANT_LOCATION_CONFIG_NAME)
}

-pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
+pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenant_path(tenant_shard_id)
+.join(TIMELINES_SEGMENT_NAME)
}

-pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
-self.timelines_path(tenant_id).join(timeline_id.to_string())
+pub fn timeline_path(
+&self,
+tenant_shard_id: &TenantShardId,
+timeline_id: &TimelineId,
+) -> Utf8PathBuf {
+self.timelines_path(tenant_shard_id)
+.join(timeline_id.to_string())
}

pub fn timeline_uninit_mark_file_path(
&self,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
-self.timeline_path(&tenant_id, &timeline_id),
+self.timeline_path(&tenant_shard_id, &timeline_id),
TIMELINE_UNINIT_MARK_SUFFIX,
)
}

pub fn timeline_delete_mark_file_path(
&self,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
-self.timeline_path(&tenant_id, &timeline_id),
+self.timeline_path(&tenant_shard_id, &timeline_id),
TIMELINE_DELETE_MARK_SUFFIX,
)
}

-pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenant_path(tenant_id)
+pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenant_path(tenant_shard_id)
.join(TENANT_DELETED_MARKER_FILE_NAME)
}

@@ -691,20 +699,24 @@ impl PageServerConf {

pub fn trace_path(
&self,
-tenant_id: &TenantId,
+tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
connection_id: &ConnectionId,
) -> Utf8PathBuf {
self.traces_path()
-.join(tenant_id.to_string())
+.join(tenant_shard_id.to_string())
.join(timeline_id.to_string())
.join(connection_id.to_string())
}

/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
-pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
-self.timeline_path(tenant_id, timeline_id)
+pub fn metadata_path(
+&self,
+tenant_shard_id: &TenantShardId,
+timeline_id: &TimelineId,
+) -> Utf8PathBuf {
+self.timeline_path(tenant_shard_id, timeline_id)
.join(METADATA_FILE_NAME)
}

@@ -767,7 +779,7 @@ impl PageServerConf {
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
}
"tenant_config" => {
-t_conf = Self::parse_toml_tenant_conf(item)?;
+t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
}
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
@@ -841,114 +853,10 @@ impl PageServerConf {
Ok(conf)
}

-// subroutine of parse_and_validate to parse `[tenant_conf]` section
-
-pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result<TenantConfOpt> {
-let mut t_conf: TenantConfOpt = Default::default();
-if let Some(checkpoint_distance) = item.get("checkpoint_distance") {
-t_conf.checkpoint_distance =
-Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
-}
-
-if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") {
-t_conf.checkpoint_timeout = Some(parse_toml_duration(
-"checkpoint_timeout",
-checkpoint_timeout,
-)?);
-}
-
-if let Some(compaction_target_size) = item.get("compaction_target_size") {
-t_conf.compaction_target_size = Some(parse_toml_u64(
-"compaction_target_size",
-compaction_target_size,
-)?);
-}
-
-if let Some(compaction_period) = item.get("compaction_period") {
-t_conf.compaction_period =
-Some(parse_toml_duration("compaction_period", compaction_period)?);
-}
-
-if let Some(compaction_threshold) = item.get("compaction_threshold") {
-t_conf.compaction_threshold =
-Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
-}
-
-if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
-t_conf.image_creation_threshold = Some(
-parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
-);
-}
-
-if let Some(gc_horizon) = item.get("gc_horizon") {
-t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
-}
-
-if let Some(gc_period) = item.get("gc_period") {
-t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?);
-}
-
-if let Some(pitr_interval) = item.get("pitr_interval") {
-t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
-}
-if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") {
-t_conf.walreceiver_connect_timeout = Some(parse_toml_duration(
-"walreceiver_connect_timeout",
-walreceiver_connect_timeout,
-)?);
-}
-if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") {
-t_conf.lagging_wal_timeout = Some(parse_toml_duration(
-"lagging_wal_timeout",
-lagging_wal_timeout,
-)?);
-}
-if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
-t_conf.max_lsn_wal_lag =
-Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
-}
-if let Some(trace_read_requests) = item.get("trace_read_requests") {
-t_conf.trace_read_requests =
-Some(trace_read_requests.as_bool().with_context(|| {
-"configure option trace_read_requests is not a bool".to_string()
-})?);
-}
-
-if let Some(eviction_policy) = item.get("eviction_policy") {
-t_conf.eviction_policy = Some(
-deserialize_from_item("eviction_policy", eviction_policy)
-.context("parse eviction_policy")?,
-);
-}
-
-if let Some(item) = item.get("min_resident_size_override") {
-t_conf.min_resident_size_override = Some(
-deserialize_from_item("min_resident_size_override", item)
-.context("parse min_resident_size_override")?,
-);
-}
-
-if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") {
-t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration(
-"evictions_low_residence_duration_metric_threshold",
-item,
-)?);
-}
-
-if let Some(gc_feedback) = item.get("gc_feedback") {
-t_conf.gc_feedback = Some(
-gc_feedback
-.as_bool()
-.with_context(|| "configure option gc_feedback is not a bool".to_string())?,
-);
-}
-
-Ok(t_conf)
-}

#[cfg(test)]
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
-Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
+let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
+Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
}

pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -1417,6 +1325,37 @@ trace_read_requests = {trace_read_requests}"#,
Ok(())
}

+#[test]
+fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
+let config_string = r#"
+[tenant_config]
+checkpoint_distance = -1 # supposed to be an u64
+"#
+.to_string();
+
+let toml: Document = config_string.parse()?;
+let item = toml.get("tenant_config").unwrap();
+let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err();
+
+let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64";
+assert_eq!(error.to_string(), expected_error_str);
+
+Ok(())
+}
+
+#[test]
+fn parse_override_tenant_config() -> anyhow::Result<()> {
+let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string();
+
+let toml: Document = config_string.parse()?;
+let item = toml.get("tenant_config").unwrap();
+let conf = TenantConfOpt::try_from(item.to_owned()).unwrap();
+
+assert_eq!(conf.min_resident_size_override, Some(400));
+
+Ok(())
+}

#[test]
fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
let tempdir = tempdir()?;
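The config.rs hunks above re-key every on-disk path helper from `TenantId` to `TenantShardId`, so a shard's directory name comes from the shard id's string form. The following sketch is hypothetical (the `FakeTenantShardId` type and its display format are stand-ins, not the real `pageserver_api` type); it only illustrates why joining `to_string()` onto the tenants directory makes the layout shard-aware.

// Hypothetical stand-in types, for illustration of the path-helper pattern only.
use camino::Utf8PathBuf;
use std::fmt;

struct FakeTenantShardId {
    tenant_id: String, // hex tenant id
    shard_number: u8,
    shard_count: u8,
}

impl fmt::Display for FakeTenantShardId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Assumed format: unsharded tenants print as the bare tenant id,
        // sharded ones get a shard suffix, so each shard gets its own directory.
        if self.shard_count == 0 {
            write!(f, "{}", self.tenant_id)
        } else {
            write!(f, "{}-{:02x}{:02x}", self.tenant_id, self.shard_number, self.shard_count)
        }
    }
}

fn tenant_path(tenants_dir: &Utf8PathBuf, id: &FakeTenantShardId) -> Utf8PathBuf {
    // Same shape as `self.tenants_path().join(tenant_shard_id.to_string())` above.
    tenants_dir.join(id.to_string())
}

fn main() {
    let base = Utf8PathBuf::from("/data/tenants");
    let id = FakeTenantShardId {
        tenant_id: "ad6c1a56f5680419d3a16ff55d97ec3c".to_string(),
        shard_number: 1,
        shard_count: 4,
    };
    // Prints /data/tenants/ad6c1a56f5680419d3a16ff55d97ec3c-0104
    println!("{}", tenant_path(&base, &id));
}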
@@ -3,7 +3,7 @@
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr, LogicalSizeCalculationCause};
+use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
@@ -12,6 +12,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::NodeId;

@@ -37,6 +38,7 @@ type RawMetric = (MetricsKey, (EventType, u64));
type Cache = HashMap<MetricsKey, (EventType, u64)>;

/// Main thread that serves metrics collection
+#[allow(clippy::too_many_arguments)]
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_interval: Duration,
@@ -44,6 +46,7 @@ pub async fn collect_metrics(
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
local_disk_storage: Utf8PathBuf,
+cancel: CancellationToken,
ctx: RequestContext,
) -> anyhow::Result<()> {
if _cached_metric_collection_interval != Duration::ZERO {
@@ -62,10 +65,15 @@ pub async fn collect_metrics(
None,
"synthetic size calculation",
false,
+cancel.child_token(),
async move {
-calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx)
-.instrument(info_span!("synthetic_size_worker"))
-.await?;
+calculate_synthetic_size_worker(
+synthetic_size_calculation_interval,
+&cancel,
+&worker_ctx,
+)
+.instrument(info_span!("synthetic_size_worker"))
+.await?;
Ok(())
},
);
@@ -241,6 +249,7 @@ async fn reschedule(
/// Caclculate synthetic size for each active tenant
async fn calculate_synthetic_size_worker(
synthetic_size_calculation_interval: Duration,
+cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<()> {
info!("starting calculate_synthetic_size_worker");
@@ -248,8 +257,6 @@ async fn calculate_synthetic_size_worker(
info!("calculate_synthetic_size_worker stopped");
};

-let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
-
loop {
let started_at = Instant::now();

@@ -261,21 +268,25 @@ async fn calculate_synthetic_size_worker(
}
};

-for (tenant_id, tenant_state) in tenants {
+for (tenant_shard_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}

-if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
-// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
-// We can put in some prioritization for consumption metrics.
-// Same for the loop that fetches computed metrics.
-// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
-// which turns out is really handy to understand the system.
-if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
-error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
-}
+if !tenant_shard_id.is_zero() {
+// We only send consumption metrics from shard 0, so don't waste time calculating
+// synthetic size on other shards.
+continue;
}

+let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
+continue;
+};
+
+// there is never any reason to exit calculate_synthetic_size_worker following any
+// return value -- we don't need to care about shutdown because no tenant is found when
+// pageserver is shut down.
+calculate_and_log(&tenant, cancel, ctx).await;
}

crate::tenant::tasks::warn_when_period_overrun(
@@ -286,7 +297,7 @@ async fn calculate_synthetic_size_worker(

let res = tokio::time::timeout_at(
started_at + synthetic_size_calculation_interval,
-task_mgr::shutdown_token().cancelled(),
+cancel.cancelled(),
)
.await;
if res.is_ok() {
@@ -294,3 +305,31 @@ async fn calculate_synthetic_size_worker(
}
}
}

+async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
+const CAUSE: LogicalSizeCalculationCause =
+LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
+
+// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
+// We can put in some prioritization for consumption metrics.
+// Same for the loop that fetches computed metrics.
+// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
+// which turns out is really handy to understand the system.
+let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
+return;
+};
+
+// this error can be returned if timeline is shutting down, but it does not
+// mean the synthetic size worker should terminate. we do not need any checks
+// in this function because `mgr::get_tenant` will error out after shutdown has
+// progressed to shutting down tenants.
+let shutting_down = matches!(
+e.downcast_ref::<PageReconstructError>(),
+Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
+);
+
+if !shutting_down {
+let tenant_shard_id = tenant.tenant_shard_id();
+error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
+}
+}
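In the consumption-metrics hunks above, the synthetic-size worker now takes a `CancellationToken` and sleeps between iterations with `tokio::time::timeout_at` racing the interval deadline against `cancel.cancelled()`, so shutdown wakes it immediately. A minimal self-contained sketch of that loop shape, with made-up work and interval:

// Minimal sketch, not the pageserver worker: same sleep-or-cancel structure.
use std::time::Duration;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;

async fn periodic_worker(interval: Duration, cancel: CancellationToken) {
    loop {
        let started_at = Instant::now();

        // ... one iteration of work would go here ...

        // Wait out the rest of the interval, but return early if cancelled.
        let res = tokio::time::timeout_at(started_at + interval, cancel.cancelled()).await;
        if res.is_ok() {
            // Ok(()) means `cancelled()` completed before the deadline: shut down.
            break;
        }
        // Err(Elapsed) means the interval passed: run the next iteration.
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let handle = tokio::spawn(periodic_worker(Duration::from_secs(10), cancel.child_token()));
    cancel.cancel();
    handle.await.unwrap();
}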
@@ -1,5 +1,4 @@
-use crate::context::RequestContext;
-use anyhow::Context;
+use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
@@ -198,12 +197,12 @@ pub(super) async fn collect_all_metrics(
};

let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
-if state != TenantState::Active {
+if state != TenantState::Active || !id.is_zero() {
None
} else {
crate::tenant::mgr::get_tenant(id, true)
.ok()
-.map(|tenant| (id, tenant))
+.map(|tenant| (id.tenant_id, tenant))
}
});

@@ -351,14 +350,17 @@ impl TimelineSnapshot {
let last_record_lsn = t.get_last_record_lsn();

let current_exact_logical_size = {
-let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
+let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
-let res = span
-.in_scope(|| t.get_current_logical_size(ctx))
-.context("get_current_logical_size");
-match res? {
+let size = span.in_scope(|| {
+t.get_current_logical_size(
+crate::tenant::timeline::GetLogicalSizePriority::Background,
+ctx,
+)
+});
+match size {
// Only send timeline logical size when it is fully calculated.
-(size, is_exact) if is_exact => Some(size),
-(_, _) => None,
+CurrentLogicalSize::Exact(ref size) => Some(size.into()),
+CurrentLogicalSize::Approximate(_) => None,
}
};
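The metrics hunk above switches from a `(size, is_exact)` tuple to matching on a `CurrentLogicalSize` enum, reporting a timeline's logical size only once the exact value is known. A toy sketch of the same decision, using a stand-in enum rather than the real pageserver type:

// `LogicalSize` here is a hypothetical stand-in for `CurrentLogicalSize`.
enum LogicalSize {
    Exact(u64),
    Approximate(u64),
}

fn reportable_size(size: LogicalSize) -> Option<u64> {
    match size {
        // Only send timeline logical size when it is fully calculated.
        LogicalSize::Exact(v) => Some(v),
        LogicalSize::Approximate(_) => None,
    }
}

fn main() {
    assert_eq!(reportable_size(LogicalSize::Exact(42)), Some(42));
    assert_eq!(reportable_size(LogicalSize::Approximate(40)), None);
}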
@@ -1,16 +1,15 @@
use std::collections::HashMap;

-use pageserver_api::control_api::{
-ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+use pageserver_api::{
+control_api::{
+ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+},
+shard::TenantShardId,
};
use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken;
use url::Url;
-use utils::{
-backoff,
-generation::Generation,
-id::{NodeId, TenantId},
-};
+use utils::{backoff, generation::Generation, id::NodeId};

use crate::config::PageServerConf;

@@ -31,17 +30,17 @@ pub enum RetryForeverError {

#[async_trait::async_trait]
pub trait ControlPlaneGenerationsApi {
-async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
+async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
async fn validate(
&self,
-tenants: Vec<(TenantId, Generation)>,
+tenants: Vec<(TenantShardId, Generation)>,
-) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
+) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
}

impl ControlPlaneClient {
/// A None return value indicates that the input `conf` object does not have control
/// plane API enabled.
-pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
+pub fn new(conf: &'static PageServerConf, cancel: CancellationToken) -> Option<Self> {
let mut url = match conf.control_plane_api.as_ref() {
Some(u) => u.clone(),
None => return None,
@@ -68,7 +67,7 @@ impl ControlPlaneClient {
http_client: client.build().expect("Failed to construct HTTP client"),
base_url: url,
node_id: conf.id,
-cancel: cancel.clone(),
+cancel,
})
}

@@ -127,7 +126,7 @@ impl ControlPlaneClient {
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
-async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("re-attach")
@@ -154,8 +153,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
async fn validate(
&self,
-tenants: Vec<(TenantId, Generation)>,
+tenants: Vec<(TenantShardId, Generation)>,
-) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("validate")
@@ -10,11 +10,12 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::tenant::remote_timeline_client::LayerFileMetadata;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile;
use anyhow::Context;
use camino::Utf8PathBuf;
-use hex::FromHex;
+use pageserver_api::shard::TenantShardId;
use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
@@ -25,7 +26,7 @@ use tracing::Instrument;
use tracing::{self, debug, error};
use utils::crashsafe::path_with_suffix_extension;
use utils::generation::Generation;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;
use utils::lsn::AtomicLsn;
use utils::lsn::Lsn;

@@ -159,11 +160,10 @@ pub struct DeletionQueueClient {
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
}

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
struct TenantDeletionList {
/// For each Timeline, a list of key fragments to append to the timeline remote path
/// when reconstructing a full key
-#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
timelines: HashMap<TimelineId, Vec<String>>,

/// The generation in which this deletion was emitted: note that this may not be the
@@ -178,43 +178,11 @@ impl TenantDeletionList {
}
}

-/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string
-fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error>
-where
-S: serde::Serializer,
-V: Serialize,
-I: AsRef<[u8]>,
-{
-let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));
-
-transformed
-.collect::<HashMap<String, &V>>()
-.serialize(serializer)
-}
-
-/// For HashMaps using a FromHex key, where we would like to decode the key
-fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error>
-where
-D: serde::de::Deserializer<'de>,
-V: Deserialize<'de>,
-I: FromHex + std::hash::Hash + Eq,
-{
-let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
-hex_map
-.into_iter()
-.map(|(k, v)| {
-I::from_hex(k)
-.map(|k| (k, v))
-.map_err(|_| serde::de::Error::custom("Invalid hex ID"))
-})
-.collect()
-}
-
/// Files ending with this suffix will be ignored and erased
/// during recovery as startup.
const TEMP_SUFFIX: &str = "tmp";

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
struct DeletionList {
/// Serialization version, for future use
version: u8,
@@ -226,8 +194,7 @@ struct DeletionList {
/// nested HashMaps by TenantTimelineID. Each Tenant only appears once
/// with one unique generation ID: if someone tries to push a second generation
/// ID for the same tenant, we will start a new DeletionList.
-#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
-tenants: HashMap<TenantId, TenantDeletionList>,
+tenants: HashMap<TenantShardId, TenantDeletionList>,

/// Avoid having to walk `tenants` to calculate the number of keys in
/// the nested deletion lists
@@ -299,7 +266,7 @@ impl DeletionList {
/// deletion list.
fn push(
&mut self,
-tenant: &TenantId,
+tenant: &TenantShardId,
timeline: &TimelineId,
generation: Generation,
objects: &mut Vec<RemotePath>,
@@ -391,7 +358,7 @@ struct TenantLsnState {

#[derive(Default)]
struct VisibleLsnUpdates {
-tenants: HashMap<TenantId, TenantLsnState>,
+tenants: HashMap<TenantShardId, TenantLsnState>,
}

impl VisibleLsnUpdates {
@@ -448,7 +415,7 @@ impl DeletionQueueClient {

pub(crate) fn recover(
&self,
-attached_tenants: HashMap<TenantId, Generation>,
+attached_tenants: HashMap<TenantShardId, Generation>,
) -> Result<(), DeletionQueueError> {
self.do_push(
&self.tx,
@@ -465,7 +432,7 @@ impl DeletionQueueClient {
/// backend will later wake up and notice that the tenant's generation requires validation.
pub(crate) async fn update_remote_consistent_lsn(
&self,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
lsn: Lsn,
@@ -476,10 +443,13 @@ impl DeletionQueueClient {
.write()
.expect("Lock should never be poisoned");

-let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState {
-timelines: HashMap::new(),
-generation: current_generation,
-});
+let tenant_entry = locked
+.tenants
+.entry(tenant_shard_id)
+.or_insert(TenantLsnState {
+timelines: HashMap::new(),
+generation: current_generation,
+});

if tenant_entry.generation != current_generation {
// Generation might have changed if we were detached and then re-attached: in this case,
@@ -506,27 +476,29 @@ impl DeletionQueueClient {
/// generations in `layers` are the generations in which those layers were written.
pub(crate) async fn push_layers(
&self,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
-layers: Vec<(LayerFileName, Generation)>,
+layers: Vec<(LayerFileName, LayerFileMetadata)>,
) -> Result<(), DeletionQueueError> {
if current_generation.is_none() {
debug!("Enqueuing deletions in legacy mode, skipping queue");

let mut layer_paths = Vec::new();
-for (layer, generation) in layers {
+for (layer, meta) in layers {
layer_paths.push(remote_layer_path(
-&tenant_id,
+&tenant_shard_id.tenant_id,
&timeline_id,
+meta.shard,
&layer,
-generation,
+meta.generation,
));
}
self.push_immediate(layer_paths).await?;
return self.flush_immediate().await;
}

-self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
+self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
}

/// When a Tenant has a generation, push_layers is always synchronous because
@@ -536,10 +508,10 @@ impl DeletionQueueClient {
/// support (`<https://github.com/neondatabase/neon/issues/5395>`)
pub(crate) fn push_layers_sync(
&self,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
-layers: Vec<(LayerFileName, Generation)>,
+layers: Vec<(LayerFileName, LayerFileMetadata)>,
) -> Result<(), DeletionQueueError> {
metrics::DELETION_QUEUE
.keys_submitted
@@ -547,7 +519,7 @@ impl DeletionQueueClient {
self.do_push(
&self.tx,
ListWriterQueueMessage::Delete(DeletionOp {
-tenant_id,
+tenant_shard_id,
timeline_id,
layers,
generation: current_generation,
@@ -750,6 +722,7 @@ impl DeletionQueue {
mod test {
use camino::Utf8Path;
use hex_literal::hex;
+use pageserver_api::shard::ShardIndex;
use std::{io::ErrorKind, time::Duration};
use tracing::info;

@@ -814,12 +787,12 @@ mod test {
}

fn set_latest_generation(&self, gen: Generation) {
-let tenant_id = self.harness.tenant_id;
+let tenant_shard_id = self.harness.tenant_shard_id;
self.mock_control_plane
.latest_generation
.lock()
.unwrap()
-.insert(tenant_id, gen);
+.insert(tenant_shard_id, gen);
}

/// Returns remote layer file name, suitable for use in assert_remote_files
@@ -828,8 +801,8 @@ mod test {
file_name: LayerFileName,
gen: Generation,
) -> anyhow::Result<String> {
-let tenant_id = self.harness.tenant_id;
-let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+let tenant_shard_id = self.harness.tenant_shard_id;
+let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
std::fs::create_dir_all(&remote_timeline_path)?;
let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
@@ -847,7 +820,7 @@ mod test {

#[derive(Debug, Clone)]
struct MockControlPlane {
-pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantId, Generation>>>,
+pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantShardId, Generation>>>,
}

impl MockControlPlane {
@@ -861,20 +834,20 @@ mod test {
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for MockControlPlane {
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
-async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
unimplemented!()
}
async fn validate(
&self,
-tenants: Vec<(TenantId, Generation)>,
+tenants: Vec<(TenantShardId, Generation)>,
-) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
let mut result = HashMap::new();

let latest_generation = self.latest_generation.lock().unwrap();

-for (tenant_id, generation) in tenants {
+for (tenant_shard_id, generation) in tenants {
-if let Some(latest) = latest_generation.get(&tenant_id) {
+if let Some(latest) = latest_generation.get(&tenant_shard_id) {
-result.insert(tenant_id, *latest == generation);
+result.insert(tenant_shard_id, *latest == generation);
}
}

@@ -978,10 +951,10 @@ mod test {
client.recover(HashMap::new())?;

let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-let tenant_id = ctx.harness.tenant_id;
+let tenant_shard_id = ctx.harness.tenant_shard_id;

let content: Vec<u8> = "victim1 contents".into();
-let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();

@@ -989,6 +962,8 @@ mod test {
// we delete, and the generation of the running Tenant.
let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
+let layer_metadata =
+LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

let remote_layer_file_name_1 =
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
@@ -1009,10 +984,10 @@ mod test {
info!("Pushing");
client
.push_layers(
-tenant_id,
+tenant_shard_id,
TIMELINE_ID,
now_generation,
-[(layer_file_name_1.clone(), layer_generation)].to_vec(),
+[(layer_file_name_1.clone(), layer_metadata)].to_vec(),
)
.await?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
@@ -1051,11 +1026,13 @@ mod test {
let stale_generation = latest_generation.previous();
// Generation that our example layer file was written with
let layer_generation = stale_generation.previous();
+let layer_metadata =
+LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

ctx.set_latest_generation(latest_generation);

-let tenant_id = ctx.harness.tenant_id;
+let tenant_shard_id = ctx.harness.tenant_shard_id;
-let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());

// Initial state: a remote layer exists
@@ -1065,10 +1042,10 @@ mod test {
tracing::debug!("Pushing...");
client
.push_layers(
-tenant_id,
+tenant_shard_id,
TIMELINE_ID,
stale_generation,
-[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)
.await?;

@@ -1080,10 +1057,10 @@ mod test {
tracing::debug!("Pushing...");
client
.push_layers(
-tenant_id,
+tenant_shard_id,
TIMELINE_ID,
latest_generation,
-[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)
.await?;

@@ -1102,14 +1079,16 @@ mod test {
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;

-let tenant_id = ctx.harness.tenant_id;
+let tenant_shard_id = ctx.harness.tenant_shard_id;

-let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();

let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
+let layer_metadata =
+LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

// Inject a deletion in the generation before generation_now: after restart,
// this deletion should _not_ get executed (only the immediately previous
@@ -1118,10 +1097,10 @@ mod test {
ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
client
.push_layers(
-tenant_id,
+tenant_shard_id,
TIMELINE_ID,
now_generation.previous(),
-[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)
.await?;

@@ -1132,10 +1111,10 @@ mod test {
ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
client
.push_layers(
-tenant_id,
+tenant_shard_id,
TIMELINE_ID,
now_generation,
-[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(),
+[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
)
.await?;

@@ -1163,7 +1142,7 @@ mod test {
drop(client);
ctx.restart().await;
let client = ctx.deletion_queue.new_client();
-client.recover(HashMap::from([(tenant_id, now_generation)]))?;
+client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?;

info!("Flush-executing");
client.flush_execute().await?;
@@ -1225,12 +1204,13 @@ pub(crate) mod mock {
match msg {
ListWriterQueueMessage::Delete(op) => {
let mut objects = op.objects;
-for (layer, generation) in op.layers {
+for (layer, meta) in op.layers {
objects.push(remote_layer_path(
-&op.tenant_id,
+&op.tenant_shard_id.tenant_id,
&op.timeline_id,
+meta.shard,
&layer,
-generation,
+meta.generation,
));
}

@@ -1310,4 +1290,34 @@ pub(crate) mod mock {
}
}
}

+/// Test round-trip serialization/deserialization, and test stability of the format
+/// vs. a static expected string for the serialized version.
+#[test]
+fn deletion_list_serialization() -> anyhow::Result<()> {
+let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c"
+.to_string()
+.parse::<TenantShardId>()?;
+let timeline_id = "be322c834ed9e709e63b5c9698691910"
+.to_string()
+.parse::<TimelineId>()?;
+let generation = Generation::new(123);
+
+let object =
+RemotePath::from_string(&format!("tenants/{tenant_id}/timelines/{timeline_id}/foo"))?;
+let mut objects = [object].to_vec();
+
+let mut example = DeletionList::new(1);
+example.push(&tenant_id, &timeline_id, generation, &mut objects);
+
+let encoded = serde_json::to_string(&example)?;
+
+let expected = "{\"version\":1,\"sequence\":1,\"tenants\":{\"ad6c1a56f5680419d3a16ff55d97ec3c\":{\"timelines\":{\"be322c834ed9e709e63b5c9698691910\":[\"foo\"]},\"generation\":123}},\"size\":1}".to_string();
+assert_eq!(encoded, expected);
+
+let decoded = serde_json::from_str::<DeletionList>(&encoded)?;
+assert_eq!(example, decoded);
+
+Ok(())
+}
}
@@ -19,6 +19,7 @@ use std::collections::HashMap;
 use std::fs::create_dir_all;
 use std::time::Duration;
 
+use pageserver_api::shard::TenantShardId;
 use regex::Regex;
 use remote_storage::RemotePath;
 use tokio_util::sync::CancellationToken;
@@ -26,13 +27,13 @@ use tracing::debug;
 use tracing::info;
 use tracing::warn;
 use utils::generation::Generation;
-use utils::id::TenantId;
 use utils::id::TimelineId;
 
 use crate::config::PageServerConf;
 use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::remote_timeline_client::LayerFileMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::virtual_file::on_fatal_io_error;
 use crate::virtual_file::MaybeFatalIo;
@@ -53,22 +54,22 @@ const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
 
 #[derive(Debug)]
 pub(super) struct DeletionOp {
-pub(super) tenant_id: TenantId,
+pub(super) tenant_shard_id: TenantShardId,
 pub(super) timeline_id: TimelineId,
 // `layers` and `objects` are both just lists of objects. `layers` is used if you do not
 // have a config object handy to project it to a remote key, and need the consuming worker
 // to do it for you.
-pub(super) layers: Vec<(LayerFileName, Generation)>,
+pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
 pub(super) objects: Vec<RemotePath>,
 
-/// The _current_ generation of the Tenant attachment in which we are enqueuing
+/// The _current_ generation of the Tenant shard attachment in which we are enqueuing
 /// this deletion.
 pub(super) generation: Generation,
 }
 
 #[derive(Debug)]
 pub(super) struct RecoverOp {
-pub(super) attached_tenants: HashMap<TenantId, Generation>,
+pub(super) attached_tenants: HashMap<TenantShardId, Generation>,
 }
 
 #[derive(Debug)]
@@ -205,7 +206,7 @@ impl ListWriter {
 
 async fn recover(
 &mut self,
-attached_tenants: HashMap<TenantId, Generation>,
+attached_tenants: HashMap<TenantShardId, Generation>,
 ) -> Result<(), anyhow::Error> {
 debug!(
 "recovering with {} attached tenants",
@@ -308,10 +309,21 @@ impl ListWriter {
 // generation was issued to another node in the interval while we restarted,
 // then we may treat deletion lists from the previous generation as if they
 // belong to our currently attached generation, and proceed to validate & execute.
-for (tenant_id, tenant_list) in &mut deletion_list.tenants {
-if let Some(attached_gen) = attached_tenants.get(tenant_id) {
+for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
+if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
 if attached_gen.previous() == tenant_list.generation {
+info!(
+seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+shard_id=%tenant_shard_id.shard_slug(),
+old_gen=?tenant_list.generation, new_gen=?attached_gen,
+"Updating gen on recovered list");
 tenant_list.generation = *attached_gen;
+} else {
+info!(
+seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+shard_id=%tenant_shard_id.shard_slug(),
+old_gen=?tenant_list.generation, new_gen=?attached_gen,
+"Encountered stale generation on recovered list");
 }
 }
 }
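Note on the hunk above: the whole recovery rule is the `attached_gen.previous() == tenant_list.generation` comparison, now keyed by tenant shard rather than tenant. A standalone sketch of that rule, not part of the patch; `Generation` here is a simplified stand-in for `utils::generation::Generation`:

    // Simplified stand-in newtype; the real Generation lives in utils::generation.
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    struct Generation(u32);

    impl Generation {
        fn previous(self) -> Generation {
            Generation(self.0.saturating_sub(1))
        }
    }

    /// A recovered deletion list is adopted into the current attachment generation
    /// only if it was written by the immediately preceding generation; anything
    /// older is logged as stale and left untouched.
    fn should_adopt(attached: Generation, list: Generation) -> bool {
        attached.previous() == list
    }

    fn main() {
        assert!(should_adopt(Generation(0xfeed_beef), Generation(0xfeed_beee)));
        assert!(!should_adopt(Generation(0xfeed_beef), Generation(0xdead_beef)));
    }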
@@ -387,25 +399,26 @@ impl ListWriter {
 );
 
 let mut layer_paths = Vec::new();
-for (layer, generation) in op.layers {
+for (layer, meta) in op.layers {
 layer_paths.push(remote_layer_path(
-&op.tenant_id,
+&op.tenant_shard_id.tenant_id,
 &op.timeline_id,
+meta.shard,
 &layer,
-generation,
+meta.generation,
 ));
 }
 layer_paths.extend(op.objects);
 
 if !self.pending.push(
-&op.tenant_id,
+&op.tenant_shard_id,
 &op.timeline_id,
 op.generation,
 &mut layer_paths,
 ) {
 self.flush().await;
 let retry_succeeded = self.pending.push(
-&op.tenant_id,
+&op.tenant_shard_id,
 &op.timeline_id,
 op.generation,
 &mut layer_paths,

@@ -178,7 +178,14 @@ where
 .unwrap_or(false);
 
 if valid && *validated_generation == tenant_lsn_state.generation {
-for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+for (timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+tracing::debug!(
+%tenant_id,
+%timeline_id,
+current = %pending_lsn.result_slot.load(),
+projected = %pending_lsn.projected,
+"advancing validated remote_consistent_lsn",
+);
 pending_lsn.result_slot.store(pending_lsn.projected);
 }
 } else {

@@ -42,7 +42,6 @@
 // reading these fields. We use the Debug impl for semi-structured logging, though.
 
 use std::{
-collections::HashMap,
 sync::Arc,
 time::{Duration, SystemTime},
 };
@@ -88,6 +87,7 @@ pub fn launch_disk_usage_global_eviction_task(
 storage: GenericRemoteStorage,
 state: Arc<State>,
 background_jobs_barrier: completion::Barrier,
+cancel: CancellationToken,
 ) -> anyhow::Result<()> {
 let Some(task_config) = &conf.disk_usage_based_eviction else {
 info!("disk usage based eviction task not configured");
@@ -103,6 +103,7 @@ pub fn launch_disk_usage_global_eviction_task(
 None,
 "disk usage based eviction",
 false,
+cancel,
 async move {
 let cancel = task_mgr::shutdown_token();
 
@@ -125,7 +126,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
 state: &State,
 task_config: &DiskUsageEvictionTaskConfig,
-_storage: &GenericRemoteStorage,
+storage: &GenericRemoteStorage,
 tenants_dir: &Utf8Path,
 cancel: CancellationToken,
 ) {
@@ -149,8 +150,14 @@ async fn disk_usage_eviction_task(
 let start = Instant::now();
 
 async {
-let res =
-disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
+let res = disk_usage_eviction_task_iteration(
+state,
+task_config,
+storage,
+tenants_dir,
+&cancel,
+)
+.await;
 
 match res {
 Ok(()) => {}
@@ -181,12 +188,13 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
 state: &State,
 task_config: &DiskUsageEvictionTaskConfig,
+storage: &GenericRemoteStorage,
 tenants_dir: &Utf8Path,
 cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
 let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
 .context("get filesystem-level disk usage before evictions")?;
-let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
+let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
 match res {
 Ok(outcome) => {
 debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -268,8 +276,9 @@ struct LayerCount {
 count: usize,
 }
 
-pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
+pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 state: &State,
+_storage: &GenericRemoteStorage,
 usage_pre: U,
 cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -310,7 +319,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 .unwrap()
 .as_micros(),
 partition,
-desc.tenant_id,
+desc.tenant_shard_id,
 desc.timeline_id,
 candidate.layer,
 );
@@ -321,16 +330,16 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 // Walk through the list of candidates, until we have accumulated enough layers to get
 // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
 // how much disk space would be used after evicting all the layers up to the current
-// point in the list. The layers are collected in 'batched', grouped per timeline.
+// point in the list.
 //
 // If we get far enough in the list that we start to evict layers that are below
 // the tenant's min-resident-size threshold, print a warning, and memorize the disk
 // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-let mut batched: HashMap<_, Vec<_>> = HashMap::new();
 let mut warned = None;
 let mut usage_planned = usage_pre;
-let mut max_batch_size = 0;
+let mut evicted_amount = 0;
-for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
+for (i, (partition, candidate)) in candidates.iter().enumerate() {
 if !usage_planned.has_pressure() {
 debug!(
 no_candidates_evicted = i,
@@ -339,25 +348,13 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 break;
 }
 
-if partition == MinResidentSizePartition::Below && warned.is_none() {
+if partition == &MinResidentSizePartition::Below && warned.is_none() {
 warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
 warned = Some(usage_planned);
 }
 
 usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
+evicted_amount += 1;
-// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
-// tasks to evict all seen layers until we have evicted enough
 
-let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
 
-// semaphore will later be used to limit eviction concurrency, and we can express at
-// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
-// but fail gracefully by not making batches larger.
-if batch.len() < u32::MAX as usize {
-batch.push(candidate.layer);
-max_batch_size = max_batch_size.max(batch.len());
-}
 }
 
 let usage_planned = match warned {
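The loop above is only the planning pass: it decides how many candidates will be evicted, while the actual evictions happen in phase 2 in the next hunk. A compressed, self-contained sketch of that planning idea; the `Usage` struct and its fields here are invented stand-ins for the crate's `Usage` trait, while `has_pressure`, `add_available_bytes` and `evicted_amount` mirror the names used in the hunks above:

    /// Invented stand-in for the Usage trait: tracks projected disk usage.
    #[derive(Clone, Copy, Debug)]
    struct Usage {
        used_bytes: u64,
        pressure_threshold_bytes: u64,
    }

    impl Usage {
        fn has_pressure(&self) -> bool {
            self.used_bytes > self.pressure_threshold_bytes
        }
        fn add_available_bytes(&mut self, freed: u64) {
            self.used_bytes = self.used_bytes.saturating_sub(freed);
        }
    }

    /// Walk candidates in eviction order, stopping once projected usage is back
    /// under the threshold; returns how many layers phase 2 should evict.
    fn plan_evictions(mut usage_planned: Usage, candidate_sizes: &[u64]) -> usize {
        let mut evicted_amount = 0;
        for size in candidate_sizes {
            if !usage_planned.has_pressure() {
                break;
            }
            usage_planned.add_available_bytes(*size);
            evicted_amount += 1;
        }
        evicted_amount
    }

    fn main() {
        let usage = Usage { used_bytes: 100, pressure_threshold_bytes: 80 };
        // 20 bytes over the threshold: the first two 15-byte layers are enough.
        assert_eq!(plan_evictions(usage, &[15, 15, 15]), 2);
    }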
@@ -372,100 +369,79 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 };
 debug!(?usage_planned, "usage planned");
 
-// phase2: evict victims batched by timeline
+// phase2: evict layers
 
 let mut js = tokio::task::JoinSet::new();
+let limit = 1000;
 
-// ratelimit to 1k files or any higher max batch size
-let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
+let mut consumed_all = false;
 
-for (timeline, batch) in batched {
-let tenant_id = timeline.tenant_id;
-let timeline_id = timeline.timeline_id;
-let batch_size =
-u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+// After the evictions, `usage_assumed` is the post-eviction usage,
+// according to internal accounting.
+let mut usage_assumed = usage_pre;
+let mut evictions_failed = LayerCount::default();
 
-// I dislike naming of `available_permits` but it means current total amount of permits
-// because permits can be added
-assert!(batch_size as usize <= limit.available_permits());
+let evict_layers = async move {
+loop {
+let next = if js.len() >= limit || consumed_all {
+js.join_next().await
+} else if !js.is_empty() {
+// opportunistically consume ready result, one per each new evicted
+futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
+} else {
+None
+};
 
-debug!(%timeline_id, "evicting batch for timeline");
+if let Some(next) = next {
+match next {
-let evict = {
-let limit = limit.clone();
-let cancel = cancel.clone();
-async move {
-let mut evicted_bytes = 0;
-let mut evictions_failed = LayerCount::default();
+Ok(Ok(file_size)) => {
+usage_assumed.add_available_bytes(file_size);
 
-let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
-// semaphore closing means cancelled
-return (evicted_bytes, evictions_failed);
-};
 
-let results = timeline.evict_layers(&batch).await;
 
-match results {
-Ok(results) => {
-assert_eq!(results.len(), batch.len());
-for (result, layer) in results.into_iter().zip(batch.iter()) {
-let file_size = layer.layer_desc().file_size;
-match result {
-Some(Ok(())) => {
-evicted_bytes += file_size;
-}
-Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-evictions_failed.file_sizes += file_size;
-evictions_failed.count += 1;
-}
-None => {
-assert!(cancel.is_cancelled());
-}
-}
-}
 }
-Err(e) => {
-warn!("failed to evict batch: {:#}", e);
+Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
+evictions_failed.file_sizes += file_size;
+evictions_failed.count += 1;
 }
+Err(je) if je.is_cancelled() => unreachable!("not used"),
+Err(je) if je.is_panic() => { /* already logged */ }
+Err(je) => tracing::error!("unknown JoinError: {je:?}"),
 }
-(evicted_bytes, evictions_failed)
 }
-}
-.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
 
-js.spawn(evict);
+if consumed_all && js.is_empty() {
+break;
-// spwaning multiple thousands of these is essentially blocking, so give already spawned a
-// chance of making progress
-tokio::task::yield_now().await;
-}
 
-let join_all = async move {
-// After the evictions, `usage_assumed` is the post-eviction usage,
-// according to internal accounting.
-let mut usage_assumed = usage_pre;
-let mut evictions_failed = LayerCount::default();
 
-while let Some(res) = js.join_next().await {
-match res {
-Ok((evicted_bytes, failed)) => {
-usage_assumed.add_available_bytes(evicted_bytes);
-evictions_failed.file_sizes += failed.file_sizes;
-evictions_failed.count += failed.count;
-}
-Err(je) if je.is_cancelled() => unreachable!("not used"),
-Err(je) if je.is_panic() => { /* already logged */ }
-Err(je) => tracing::error!("unknown JoinError: {je:?}"),
 }
 
+// calling again when consumed_all is fine as evicted is fused.
+let Some((_partition, candidate)) = evicted.next() else {
+consumed_all = true;
+continue;
+};
 
+js.spawn(async move {
+let rtc = candidate.timeline.remote_client.as_ref().expect(
+"holding the witness, all timelines must have a remote timeline client",
+);
+let file_size = candidate.layer.layer_desc().file_size;
+candidate
+.layer
+.evict_and_wait(rtc)
+.await
+.map(|()| file_size)
+.map_err(|e| (file_size, e))
+});
 
+tokio::task::yield_now().await;
 }
 
 (usage_assumed, evictions_failed)
 };
 
 let (usage_assumed, evictions_failed) = tokio::select! {
-tuple = join_all => { tuple },
+tuple = evict_layers => { tuple },
 _ = cancel.cancelled() => {
-// close the semaphore to stop any pending acquires
-limit.close();
+// dropping joinset will abort all pending evict_and_waits and that is fine, our
+// requests will still stand
 return Ok(IterationOutcome::Cancelled);
 }
 };
@@ -572,7 +548,7 @@ async fn collect_eviction_candidates(
 continue;
 }
 let info = tl.get_local_layers_for_disk_usage_eviction().await;
-debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
 tenant_candidates.extend(
 info.resident_layers
 .into_iter()

@@ -84,7 +84,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 get:
 description: Get tenant status
 responses:
@@ -181,7 +180,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 get:
 description: Get timelines for tenant
 responses:
@@ -232,7 +230,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: timeline_id
 in: path
 required: true
@@ -338,7 +335,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: timeline_id
 in: path
 required: true
@@ -401,7 +397,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: timeline_id
 in: path
 required: true
@@ -469,7 +464,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: timeline_id
 in: path
 required: true
@@ -523,7 +517,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 post:
 description: |
 Schedules attach operation to happen in the background for the given tenant.
@@ -624,6 +617,98 @@ paths:
 $ref: "#/components/schemas/ServiceUnavailableError"
 
 
+/v1/tenant/{tenant_id}/location_config:
+parameters:
+- name: tenant_id
+in: path
+required: true
+schema:
+type: string
+- name: flush_ms
+in: query
+required: false
+schema:
+type: integer
+put:
+description: |
+Configures a _tenant location_, that is how a particular pageserver handles
+a particular tenant. This includes _attached_ tenants, i.e. those ingesting WAL
+and page service requests, and _secondary_ tenants, i.e. those which are just keeping
+a warm cache in anticipation of transitioning to attached state in the future.
+
+This is a declarative, idempotent API: there are not separate endpoints
+for different tenant location configurations. Rather, this single endpoint accepts
+a description of the desired location configuration, and makes whatever changes
+are required to reach that state.
+
+In imperative terms, this API is used to attach and detach tenants, and
+to transition tenants to and from secondary mode.
+
+This is a synchronous API: there is no 202 response. State transitions should always
+be fast (milliseconds), with the exception of requests setting `flush_ms`, in which case
+the caller controls the runtime of the request.
+
+In some state transitions, it makes sense to flush dirty data to remote storage: this includes transitions
+to AttachedStale and Detached. Flushing is never necessary for correctness, but is an
+important optimization when doing migrations. The `flush_ms` parameter controls whether
+flushing should be attempted, and how much time is allowed for flushing. If the time limit expires,
+the requested transition will continue without waiting for any outstanding data to flush. Callers
+should use a duration which is substantially less than their HTTP client's request
+timeout. It is safe to supply flush_ms irrespective of the request body: in state transitions
+where flushing doesn't make sense, the server will ignore it.
+
+It is safe to retry requests, but if one receives a 409 or 503 response, it is not
+useful to retry aggressively: there is probably an existing request still ongoing.
+requestBody:
+required: false
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/TenantLocationConfigRequest"
+responses:
+"200":
+description: Tenant is now in requested state
+"503":
+description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/Error"
+"401":
+description: Unauthorized Error
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/UnauthorizedError"
+"403":
+description: Forbidden Error
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/ForbiddenError"
+"409":
+description: |
+The tenant is already known to Pageserver in some way,
+and hence this `/attach` call has been rejected.
+
+Some examples of how this can happen:
+- tenant was created on this pageserver
+- tenant attachment was started by an earlier call to `/attach`.
+
+Callers should poll the tenant status's `attachment_status` field,
+like for status 202. See the longer description for `POST /attach`
+for details.
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/ConflictError"
+"500":
+description: Generic operation error
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/Error"
+
 /v1/tenant/{tenant_id}/detach:
 parameters:
 - name: tenant_id
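As a concrete illustration of the request bodies this endpoint accepts, a hypothetical caller might build something like the following; all values are invented, the field names come from the TenantLocationConfigRequest and SecondaryConfig schemas added further down in this patch, and serde_json is used only because that crate already appears elsewhere in the patch:

    use serde_json::json;

    fn main() {
        // Hypothetical attach request: generation is mandatory for attached modes.
        let attach_body = json!({
            "tenant_id": "ad6c1a56f5680419d3a16ff55d97ec3c",
            "mode": "AttachedSingle",
            "generation": 123,
        });

        // Hypothetical secondary request: no generation, just a warm-cache hint.
        let secondary_body = json!({
            "tenant_id": "ad6c1a56f5680419d3a16ff55d97ec3c",
            "mode": "Secondary",
            "secondary_conf": { "warm": true },
        });

        // Either body would be PUT to /v1/tenant/{tenant_id}/location_config,
        // optionally with ?flush_ms=... when flushing before the transition matters.
        println!("{attach_body}\n{secondary_body}");
    }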
@@ -631,7 +716,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: detach_ignored
 in: query
 required: false
@@ -691,7 +775,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 post:
 description: |
 Remove tenant data (including all corresponding timelines) from pageserver's memory.
@@ -740,7 +823,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 post:
 description: |
 Schedules an operation that attempts to load a tenant from the local disk and
@@ -797,7 +879,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 get:
 description: |
 Calculate tenant's synthetic size
@@ -840,7 +921,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: inputs_only
 in: query
 required: false
@@ -910,7 +990,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 post:
 description: |
 Create a timeline. Returns new timeline id on success.\
@@ -935,6 +1014,9 @@ paths:
 format: hex
 pg_version:
 type: integer
+existing_initdb_timeline_id:
+type: string
+format: hex
 responses:
 "201":
 description: TimelineInfo
@@ -1041,7 +1123,6 @@ paths:
 application/json:
 schema:
 type: string
-format: hex
 "400":
 description: Malformed tenant create request
 content:
@@ -1138,7 +1219,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 get:
 description: |
 Returns tenant's config description: specific config overrides a tenant has
@@ -1244,7 +1324,6 @@ components:
 properties:
 new_tenant_id:
 type: string
-format: hex
 generation:
 type: integer
 description: Attachment generation number.
@@ -1273,7 +1352,30 @@ components:
 properties:
 tenant_id:
 type: string
-format: hex
+TenantLocationConfigRequest:
+type: object
+required:
+- tenant_id
+properties:
+tenant_id:
+type: string
+mode:
+type: string
+enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
+description: Mode of functionality that this pageserver will run in for this tenant.
+generation:
+type: integer
+description: Attachment generation number, mandatory when `mode` is an attached state
+secondary_conf:
+$ref: '#/components/schemas/SecondaryConfig'
+tenant_conf:
+$ref: '#/components/schemas/TenantConfig'
+SecondaryConfig:
+type: object
+properties:
+warm:
+type: boolean
+description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
 TenantConfig:
 type: object
 properties:
@@ -1325,7 +1427,6 @@ components:
 format: hex
 tenant_id:
 type: string
-format: hex
 last_record_lsn:
 type: string
 format: hex

@@ -4,8 +4,10 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
use enumset::EnumSet;
|
||||||
use futures::TryFutureExt;
|
use futures::TryFutureExt;
|
||||||
use humantime::format_rfc3339;
|
use humantime::format_rfc3339;
|
||||||
use hyper::header;
|
use hyper::header;
|
||||||
@@ -42,6 +44,7 @@ use crate::tenant::mgr::{
|
|||||||
};
|
};
|
||||||
use crate::tenant::size::ModelInputs;
|
use crate::tenant::size::ModelInputs;
|
||||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||||
|
use crate::tenant::timeline::CompactFlags;
|
||||||
use crate::tenant::timeline::Timeline;
|
use crate::tenant::timeline::Timeline;
|
||||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
|
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
|
||||||
use crate::{config::PageServerConf, tenant::mgr};
|
use crate::{config::PageServerConf, tenant::mgr};
|
||||||
@@ -133,11 +136,6 @@ impl From<PageReconstructError> for ApiError {
|
|||||||
fn from(pre: PageReconstructError) -> ApiError {
|
fn from(pre: PageReconstructError) -> ApiError {
|
||||||
match pre {
|
match pre {
|
||||||
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
|
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
|
||||||
PageReconstructError::NeedsDownload(_, _) => {
|
|
||||||
// This shouldn't happen, because we use a RequestContext that requests to
|
|
||||||
// download any missing layer files on-demand.
|
|
||||||
ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
|
|
||||||
}
|
|
||||||
PageReconstructError::Cancelled => {
|
PageReconstructError::Cancelled => {
|
||||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||||
}
|
}
|
||||||
@@ -316,6 +314,7 @@ async fn build_timeline_info_common(
|
|||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<TimelineInfo> {
|
) -> anyhow::Result<TimelineInfo> {
|
||||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
let initdb_lsn = timeline.initdb_lsn;
|
||||||
let last_record_lsn = timeline.get_last_record_lsn();
|
let last_record_lsn = timeline.get_last_record_lsn();
|
||||||
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
||||||
let guard = timeline.last_received_wal.lock().unwrap();
|
let guard = timeline.last_received_wal.lock().unwrap();
|
||||||
@@ -335,13 +334,8 @@ async fn build_timeline_info_common(
|
|||||||
Lsn(0) => None,
|
Lsn(0) => None,
|
||||||
lsn @ Lsn(_) => Some(lsn),
|
lsn @ Lsn(_) => Some(lsn),
|
||||||
};
|
};
|
||||||
let current_logical_size = match timeline.get_current_logical_size(ctx) {
|
let current_logical_size =
|
||||||
Ok((size, _)) => Some(size),
|
timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
|
||||||
Err(err) => {
|
|
||||||
error!("Timeline info creation failed to get current logical size: {err:?}");
|
|
||||||
None
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||||
let state = timeline.current_state();
|
let state = timeline.current_state();
|
||||||
let remote_consistent_lsn_projected = timeline
|
let remote_consistent_lsn_projected = timeline
|
||||||
@@ -354,17 +348,22 @@ async fn build_timeline_info_common(
|
|||||||
let walreceiver_status = timeline.walreceiver_status();
|
let walreceiver_status = timeline.walreceiver_status();
|
||||||
|
|
||||||
let info = TimelineInfo {
|
let info = TimelineInfo {
|
||||||
tenant_id: timeline.tenant_id,
|
tenant_id: timeline.tenant_shard_id,
|
||||||
timeline_id: timeline.timeline_id,
|
timeline_id: timeline.timeline_id,
|
||||||
ancestor_timeline_id,
|
ancestor_timeline_id,
|
||||||
ancestor_lsn,
|
ancestor_lsn,
|
||||||
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
|
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
|
||||||
remote_consistent_lsn: remote_consistent_lsn_projected,
|
remote_consistent_lsn: remote_consistent_lsn_projected,
|
||||||
remote_consistent_lsn_visible,
|
remote_consistent_lsn_visible,
|
||||||
|
initdb_lsn,
|
||||||
last_record_lsn,
|
last_record_lsn,
|
||||||
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
||||||
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
||||||
current_logical_size,
|
current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
|
||||||
|
current_logical_size_is_accurate: match current_logical_size.accuracy() {
|
||||||
|
tenant::timeline::logical_size::Accuracy::Approximate => false,
|
||||||
|
tenant::timeline::logical_size::Accuracy::Exact => true,
|
||||||
|
},
|
||||||
current_physical_size,
|
current_physical_size,
|
||||||
current_logical_size_non_incremental: None,
|
current_logical_size_non_incremental: None,
|
||||||
timeline_dir_layer_file_size_sum: None,
|
timeline_dir_layer_file_size_sum: None,
|
||||||
@@ -437,6 +436,7 @@ async fn timeline_create_handler(
|
|||||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||||
request_data.ancestor_start_lsn,
|
request_data.ancestor_start_lsn,
|
||||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||||
|
request_data.existing_initdb_timeline_id,
|
||||||
state.broker_client.clone(),
|
state.broker_client.clone(),
|
||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
@@ -476,15 +476,15 @@ async fn timeline_list_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let include_non_incremental_logical_size: Option<bool> =
|
let include_non_incremental_logical_size: Option<bool> =
|
||||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let response_data = async {
|
let response_data = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||||
let timelines = tenant.list_timelines();
|
let timelines = tenant.list_timelines();
|
||||||
|
|
||||||
let mut response_data = Vec::with_capacity(timelines.len());
|
let mut response_data = Vec::with_capacity(timelines.len());
|
||||||
@@ -503,7 +503,9 @@ async fn timeline_list_handler(
|
|||||||
}
|
}
|
||||||
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
|
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_list", %tenant_id))
|
.instrument(info_span!("timeline_list",
|
||||||
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
|
shard_id = %tenant_shard_id.shard_slug()))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, response_data)
|
json_response(StatusCode::OK, response_data)
|
||||||
@@ -513,17 +515,17 @@ async fn timeline_detail_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let include_non_incremental_logical_size: Option<bool> =
|
let include_non_incremental_logical_size: Option<bool> =
|
||||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
// Logical size calculation needs downloading.
|
// Logical size calculation needs downloading.
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let timeline_info = async {
|
let timeline_info = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||||
|
|
||||||
let timeline = tenant
|
let timeline = tenant
|
||||||
.get_timeline(timeline_id, false)
|
.get_timeline(timeline_id, false)
|
||||||
@@ -540,7 +542,10 @@ async fn timeline_detail_handler(
|
|||||||
|
|
||||||
Ok::<_, ApiError>(timeline_info)
|
Ok::<_, ApiError>(timeline_info)
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
|
.instrument(info_span!("timeline_detail",
|
||||||
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
|
shard_id = %tenant_shard_id.shard_slug(),
|
||||||
|
%timeline_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, timeline_info)
|
json_response(StatusCode::OK, timeline_info)
|
||||||
@@ -548,10 +553,17 @@ async fn timeline_detail_handler(
|
|||||||
|
|
||||||
async fn get_lsn_by_timestamp_handler(
|
async fn get_lsn_by_timestamp_handler(
|
||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
|
if !tenant_shard_id.is_zero() {
|
||||||
|
// Requires SLRU contents, which are only stored on shard zero
|
||||||
|
return Err(ApiError::BadRequest(anyhow!(
|
||||||
|
"Size calculations are only available on shard zero"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
let version: Option<u8> = parse_query_param(&request, "version")?;
|
let version: Option<u8> = parse_query_param(&request, "version")?;
|
||||||
|
|
||||||
@@ -563,8 +575,10 @@ async fn get_lsn_by_timestamp_handler(
|
|||||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
|
let result = timeline
|
||||||
|
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
if version.unwrap_or(0) > 1 {
|
if version.unwrap_or(0) > 1 {
|
||||||
#[derive(serde::Serialize)]
|
#[derive(serde::Serialize)]
|
||||||
@@ -596,8 +610,15 @@ async fn get_timestamp_of_lsn_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
|
if !tenant_shard_id.is_zero() {
|
||||||
|
// Requires SLRU contents, which are only stored on shard zero
|
||||||
|
return Err(ApiError::BadRequest(anyhow!(
|
||||||
|
"Size calculations are only available on shard zero"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
|
|
||||||
@@ -607,7 +628,7 @@ async fn get_timestamp_of_lsn_handler(
|
|||||||
.map_err(ApiError::BadRequest)?;
|
.map_err(ApiError::BadRequest)?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
|
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
@@ -703,6 +724,26 @@ async fn tenant_detach_handler(
|
|||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn tenant_reset_handler(
|
||||||
|
request: Request<Body>,
|
||||||
|
_cancel: CancellationToken,
|
||||||
|
) -> Result<Response<Body>, ApiError> {
|
||||||
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
|
let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
|
||||||
|
|
||||||
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||||
|
let state = get_state(&request);
|
||||||
|
state
|
||||||
|
.tenant_manager
|
||||||
|
.reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
|
json_response(StatusCode::OK, ())
|
||||||
|
}
|
||||||
|
|
||||||
async fn tenant_load_handler(
|
async fn tenant_load_handler(
|
||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
@@ -779,11 +820,11 @@ async fn tenant_status(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let tenant_info = async {
|
let tenant_info = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, false)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||||
|
|
||||||
// Calculate total physical size of all timelines
|
// Calculate total physical size of all timelines
|
||||||
let mut current_physical_size = 0;
|
let mut current_physical_size = 0;
|
||||||
@@ -793,13 +834,15 @@ async fn tenant_status(
|
|||||||
|
|
||||||
let state = tenant.current_state();
|
let state = tenant.current_state();
|
||||||
Result::<_, ApiError>::Ok(TenantInfo {
|
Result::<_, ApiError>::Ok(TenantInfo {
|
||||||
id: tenant_id,
|
id: tenant_shard_id,
|
||||||
state: state.clone(),
|
state: state.clone(),
|
||||||
current_physical_size: Some(current_physical_size),
|
current_physical_size: Some(current_physical_size),
|
||||||
attachment_status: state.attachment_status(),
|
attachment_status: state.attachment_status(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
.instrument(info_span!("tenant_status_handler", %tenant_id))
|
.instrument(info_span!("tenant_status_handler",
|
||||||
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
|
shard_id = %tenant_shard_id.shard_slug()))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, tenant_info)
|
json_response(StatusCode::OK, tenant_info)
|
||||||
@@ -818,7 +861,7 @@ async fn tenant_delete_handler(
|
|||||||
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
|
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
|
||||||
.instrument(info_span!("tenant_delete_handler",
|
.instrument(info_span!("tenant_delete_handler",
|
||||||
tenant_id = %tenant_shard_id.tenant_id,
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
shard = tenant_shard_id.shard_slug()
|
shard = %tenant_shard_id.shard_slug()
|
||||||
))
|
))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -840,22 +883,29 @@ async fn tenant_delete_handler(
|
|||||||
/// without modifying anything anyway.
|
/// without modifying anything anyway.
|
||||||
async fn tenant_size_handler(
|
async fn tenant_size_handler(
|
||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||||
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
||||||
let headers = request.headers();
|
let headers = request.headers();
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||||
|
|
||||||
|
if !tenant_shard_id.is_zero() {
|
||||||
|
return Err(ApiError::BadRequest(anyhow!(
|
||||||
|
"Size calculations are only available on shard zero"
|
||||||
|
)));
|
||||||
|
}
|
||||||
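For readers unfamiliar with the sharding types: `is_zero()` is expected to report whether this is shard zero of the tenant, the only shard allowed to run tenant-wide size calculations. A minimal sketch of such a helper, assuming the shard identity carries a numeric shard index (the real `TenantShardId` lives in `pageserver_api` and is only approximated here):

// Sketch only: a simplified stand-in for the shard identity used above,
// assuming it exposes a numeric shard index. Size calculations aggregate
// tenant-wide data, so the handler accepts them only on shard zero.
#[derive(Clone, Copy, Debug)]
struct ShardIndex {
    shard_number: u8,
    shard_count: u8,
}

impl ShardIndex {
    /// True for the zeroth shard, which owns tenant-wide operations
    /// such as synthetic size calculation.
    fn is_zero(&self) -> bool {
        self.shard_number == 0
    }
}

fn main() {
    let shard = ShardIndex { shard_number: 0, shard_count: 4 };
    println!("shard {} of {}", shard.shard_number, shard.shard_count);
    assert!(shard.is_zero());
}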
|
|
||||||
// this can be a long operation
|
// this can be a long operation
|
||||||
let inputs = tenant
|
let inputs = tenant
|
||||||
.gather_size_inputs(
|
.gather_size_inputs(
|
||||||
retention_period,
|
retention_period,
|
||||||
LogicalSizeCalculationCause::TenantSizeHandler,
|
LogicalSizeCalculationCause::TenantSizeHandler,
|
||||||
|
&cancel,
|
||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
@@ -900,7 +950,7 @@ async fn tenant_size_handler(
|
|||||||
json_response(
|
json_response(
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
TenantHistorySize {
|
TenantHistorySize {
|
||||||
id: tenant_id,
|
id: tenant_shard_id.tenant_id,
|
||||||
size: sizes.as_ref().map(|x| x.total_size),
|
size: sizes.as_ref().map(|x| x.total_size),
|
||||||
segment_sizes: sizes.map(|x| x.segments),
|
segment_sizes: sizes.map(|x| x.segments),
|
||||||
inputs,
|
inputs,
|
||||||
@@ -912,14 +962,14 @@ async fn layer_map_info_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let reset: LayerAccessStatsReset =
|
let reset: LayerAccessStatsReset =
|
||||||
parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
|
parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
|
||||||
|
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let layer_map_info = timeline.layer_map_info(reset).await;
|
let layer_map_info = timeline.layer_map_info(reset).await;
|
||||||
|
|
||||||
json_response(StatusCode::OK, layer_map_info)
|
json_response(StatusCode::OK, layer_map_info)
|
||||||
@@ -929,13 +979,12 @@ async fn layer_download_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let downloaded = timeline
|
let downloaded = timeline
|
||||||
.download_layer(layer_file_name)
|
.download_layer(layer_file_name)
|
||||||
.await
|
.await
|
||||||
@@ -946,7 +995,7 @@ async fn layer_download_handler(
|
|||||||
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
|
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
|
||||||
None => json_response(
|
None => json_response(
|
||||||
StatusCode::BAD_REQUEST,
|
StatusCode::BAD_REQUEST,
|
||||||
format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
|
format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -955,12 +1004,12 @@ async fn evict_timeline_layer_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
||||||
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let evicted = timeline
|
let evicted = timeline
|
||||||
.evict_layer(layer_file_name)
|
.evict_layer(layer_file_name)
|
||||||
.await
|
.await
|
||||||
@@ -971,7 +1020,7 @@ async fn evict_timeline_layer_handler(
|
|||||||
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
|
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
|
||||||
None => json_response(
|
None => json_response(
|
||||||
StatusCode::BAD_REQUEST,
|
StatusCode::BAD_REQUEST,
|
||||||
format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
|
format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1103,10 +1152,10 @@ async fn get_tenant_config_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let tenant = mgr::get_tenant(tenant_id, false)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||||
|
|
||||||
let response = HashMap::from([
|
let response = HashMap::from([
|
||||||
(
|
(
|
||||||
@@ -1152,6 +1201,7 @@ async fn put_tenant_location_config_handler(
|
|||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
|
|
||||||
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
|
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
|
||||||
|
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||||
@@ -1165,7 +1215,7 @@ async fn put_tenant_location_config_handler(
|
|||||||
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
||||||
.instrument(info_span!("tenant_detach",
|
.instrument(info_span!("tenant_detach",
|
||||||
tenant_id = %tenant_shard_id.tenant_id,
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
shard = tenant_shard_id.shard_slug()
|
shard = %tenant_shard_id.shard_slug()
|
||||||
))
|
))
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
@@ -1184,7 +1234,7 @@ async fn put_tenant_location_config_handler(
|
|||||||
|
|
||||||
state
|
state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.upsert_location(tenant_shard_id, location_conf, &ctx)
|
.upsert_location(tenant_shard_id, location_conf, flush, &ctx)
|
||||||
.await
|
.await
|
||||||
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
||||||
// principle we might have hit something like concurrent API calls to the same tenant,
|
// principle we might have hit something like concurrent API calls to the same tenant,
|
||||||
@@ -1199,9 +1249,9 @@ async fn handle_tenant_break(
|
|||||||
r: Request<Body>,
|
r: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
||||||
|
|
||||||
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
|
let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
|
||||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||||
|
|
||||||
tenant.set_broken("broken from test".to_owned()).await;
|
tenant.set_broken("broken from test".to_owned()).await;
|
||||||
@@ -1240,16 +1290,17 @@ async fn failpoints_handler(
|
|||||||
// Run GC immediately on given timeline.
|
// Run GC immediately on given timeline.
|
||||||
async fn timeline_gc_handler(
|
async fn timeline_gc_handler(
|
||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, &ctx).await?;
|
let wait_task_done =
|
||||||
|
mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
|
||||||
let gc_result = wait_task_done
|
let gc_result = wait_task_done
|
||||||
.await
|
.await
|
||||||
.context("wait for gc task")
|
.context("wait for gc task")
|
||||||
@@ -1264,20 +1315,24 @@ async fn timeline_compact_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
|
let mut flags = EnumSet::empty();
|
||||||
|
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
|
||||||
|
flags |= CompactFlags::ForceRepartition;
|
||||||
|
}
|
||||||
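The flag handling added above relies on the `enumset` crate. A reduced, self-contained sketch of the same pattern, assuming a hypothetical flag enum that derives `EnumSetType` in place of the pageserver's `CompactFlags`:

// Sketch only: building a set of compaction flags from an optional query
// parameter, mirroring the handler above. Assumes the `enumset` crate and a
// hypothetical Flag enum standing in for CompactFlags.
use enumset::{EnumSet, EnumSetType};

#[derive(EnumSetType, Debug)]
enum Flag {
    ForceRepartition,
}

fn flags_from_query(force_repartition: Option<bool>) -> EnumSet<Flag> {
    let mut flags = EnumSet::empty();
    if force_repartition == Some(true) {
        flags |= Flag::ForceRepartition;
    }
    flags
}

fn main() {
    assert!(flags_from_query(Some(true)).contains(Flag::ForceRepartition));
    assert!(flags_from_query(None).is_empty());
}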
async {
|
async {
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
timeline
|
timeline
|
||||||
.compact(&cancel, &ctx)
|
.compact(&cancel, flags, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
|
.instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1286,24 +1341,29 @@ async fn timeline_checkpoint_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
|
let mut flags = EnumSet::empty();
|
||||||
|
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
|
||||||
|
flags |= CompactFlags::ForceRepartition;
|
||||||
|
}
|
||||||
async {
|
async {
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
timeline
|
timeline
|
||||||
.freeze_and_flush()
|
.freeze_and_flush()
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
timeline
|
timeline
|
||||||
.compact(&cancel, &ctx)
|
.compact(&cancel, flags, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
.instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
|
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1311,12 +1371,12 @@ async fn timeline_download_remote_layers_handler_post(
|
|||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
|
let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
match timeline.spawn_download_all_remote_layers(body).await {
|
match timeline.spawn_download_all_remote_layers(body).await {
|
||||||
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
||||||
Err(st) => json_response(StatusCode::CONFLICT, st),
|
Err(st) => json_response(StatusCode::CONFLICT, st),
|
||||||
@@ -1327,11 +1387,11 @@ async fn timeline_download_remote_layers_handler_get(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let info = timeline
|
let info = timeline
|
||||||
.get_download_all_remote_layers_task_info()
|
.get_download_all_remote_layers_task_info()
|
||||||
.context("task never started since last pageserver process start")
|
.context("task never started since last pageserver process start")
|
||||||
@@ -1377,9 +1437,9 @@ async fn getpage_at_lsn_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
struct Key(crate::repository::Key);
|
struct Key(crate::repository::Key);
|
||||||
|
|
||||||
@@ -1398,7 +1458,7 @@ async fn getpage_at_lsn_handler(
|
|||||||
|
|
||||||
async {
|
async {
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
|
|
||||||
let page = timeline.get(key.0, lsn, &ctx).await?;
|
let page = timeline.get(key.0, lsn, &ctx).await?;
|
||||||
|
|
||||||
@@ -1410,7 +1470,7 @@ async fn getpage_at_lsn_handler(
|
|||||||
.unwrap(),
|
.unwrap(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
|
.instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1418,9 +1478,9 @@ async fn timeline_collect_keyspace(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
struct Partitioning {
|
struct Partitioning {
|
||||||
keys: crate::keyspace::KeySpace,
|
keys: crate::keyspace::KeySpace,
|
||||||
@@ -1489,7 +1549,7 @@ async fn timeline_collect_keyspace(
|
|||||||
|
|
||||||
async {
|
async {
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
||||||
let keys = timeline
|
let keys = timeline
|
||||||
.collect_keyspace(at_lsn, &ctx)
|
.collect_keyspace(at_lsn, &ctx)
|
||||||
@@ -1498,15 +1558,15 @@ async fn timeline_collect_keyspace(
|
|||||||
|
|
||||||
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
|
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
|
.instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn active_timeline_of_active_tenant(
|
async fn active_timeline_of_active_tenant(
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> Result<Arc<Timeline>, ApiError> {
|
) -> Result<Arc<Timeline>, ApiError> {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||||
tenant
|
tenant
|
||||||
.get_timeline(timeline_id, true)
|
.get_timeline(timeline_id, true)
|
||||||
.map_err(|e| ApiError::NotFound(e.into()))
|
.map_err(|e| ApiError::NotFound(e.into()))
|
||||||
@@ -1528,7 +1588,7 @@ async fn always_panic_handler(
|
|||||||
|
|
||||||
async fn disk_usage_eviction_run(
|
async fn disk_usage_eviction_run(
|
||||||
mut r: Request<Body>,
|
mut r: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
check_permission(&r, None)?;
|
check_permission(&r, None)?;
|
||||||
|
|
||||||
@@ -1565,48 +1625,26 @@ async fn disk_usage_eviction_run(
|
|||||||
freed_bytes: 0,
|
freed_bytes: 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
|
||||||
|
|
||||||
let state = get_state(&r);
|
let state = get_state(&r);
|
||||||
|
|
||||||
if state.remote_storage.as_ref().is_none() {
|
let Some(storage) = state.remote_storage.as_ref() else {
|
||||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||||
"remote storage not configured, cannot run eviction iteration"
|
"remote storage not configured, cannot run eviction iteration"
|
||||||
)));
|
)));
|
||||||
}
|
};
|
||||||
|
|
||||||
let state = state.disk_usage_eviction_state.clone();
|
let state = state.disk_usage_eviction_state.clone();
|
||||||
|
|
||||||
let cancel = CancellationToken::new();
|
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||||
let child_cancel = cancel.clone();
|
&state, storage, usage, &cancel,
|
||||||
let _g = cancel.drop_guard();
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
crate::task_mgr::spawn(
|
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
|
||||||
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
|
|
||||||
TaskKind::DiskUsageEviction,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
"ondemand disk usage eviction",
|
|
||||||
false,
|
|
||||||
async move {
|
|
||||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
|
||||||
&state,
|
|
||||||
usage,
|
|
||||||
&child_cancel,
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
|
let res = res.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
let _ = tx.send(res);
|
json_response(StatusCode::OK, res)
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
.in_current_span(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
|
|
||||||
|
|
||||||
json_response(StatusCode::OK, response)
|
|
||||||
}
|
}
|
||||||
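For contrast with the rewrite above, this is roughly the shape of the pattern that was removed: spawn a worker task and shuttle its result back over a oneshot channel. Names and the Ok(42) payload are hypothetical; the point is that the handler blocked on the receiver anyway, so awaiting the iteration inline with the request's cancellation token does the same job with less machinery.

// Sketch only: the rough shape of the removed spawn-plus-oneshot pattern.
use tokio::sync::oneshot;

async fn run_spawned() -> Result<u64, String> {
    let (tx, rx) = oneshot::channel();

    tokio::spawn(async move {
        // Stand-in for disk_usage_eviction_task_iteration_impl(...).await
        let res: Result<u64, String> = Ok(42);
        let _ = tx.send(res);
    });

    // The handler waits for the result here anyway, which is why awaiting the
    // work directly (as in the new code) behaves the same.
    rx.await.expect("worker dropped the sender")
}

#[tokio::main]
async fn main() {
    assert_eq!(run_spawned().await, Ok(42));
}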
|
|
||||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
@@ -1675,8 +1713,24 @@ where
|
|||||||
let token_cloned = token.clone();
|
let token_cloned = token.clone();
|
||||||
let result = handler(r, token).await;
|
let result = handler(r, token).await;
|
||||||
if token_cloned.is_cancelled() {
|
if token_cloned.is_cancelled() {
|
||||||
info!("Cancelled request finished");
|
// dropguard has executed: we will never turn this result into a response.
|
||||||
|
//
|
||||||
|
// at least temporarily do {:?} logging; these failures are rare enough but
|
||||||
|
// could hide difficult errors.
|
||||||
|
match &result {
|
||||||
|
Ok(response) => {
|
||||||
|
let status = response.status();
|
||||||
|
info!(%status, "Cancelled request finished successfully")
|
||||||
|
}
|
||||||
|
Err(e) => error!("Cancelled request finished with an error: {e:?}"),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// the only logging for cancelled panicked request handlers is the tracing_panic_hook,
|
||||||
|
// which should suffice.
|
||||||
|
//
|
||||||
|
// there is still a chance to lose the result due to race between
|
||||||
|
// returning from here and the actual connection closing happening
|
||||||
|
// before outer task gets to execute. leaving that up for #5815.
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
.in_current_span(),
|
.in_current_span(),
|
||||||
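A reduced sketch of the wrapper pattern shown in this hunk, with hypothetical handler and error types: the token is cloned before being moved into the handler so that, once the handler returns, the wrapper can still tell whether the client went away and log the result that will never become a response.

// Sketch only: cancellation-aware result logging around a request handler.
use tokio_util::sync::CancellationToken;

async fn run_handler<F, Fut>(handler: F, token: CancellationToken) -> Result<u16, String>
where
    F: FnOnce(CancellationToken) -> Fut,
    Fut: std::future::Future<Output = Result<u16, String>>,
{
    let token_cloned = token.clone();
    let result = handler(token).await;
    if token_cloned.is_cancelled() {
        // The connection is gone; this result will never reach the client,
        // so record it in the logs instead.
        match &result {
            Ok(status) => println!("Cancelled request finished successfully, status {status}"),
            Err(e) => eprintln!("Cancelled request finished with an error: {e:?}"),
        }
    }
    result
}

#[tokio::main]
async fn main() {
    let token = CancellationToken::new();
    token.cancel(); // simulate the client going away mid-request
    let res = run_handler(|_token| async { Ok(200) }, token).await;
    assert_eq!(res, Ok(200));
}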
@@ -1767,23 +1821,25 @@ pub fn make_router(
|
|||||||
})
|
})
|
||||||
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
||||||
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
||||||
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
|
.get("/v1/tenant/:tenant_shard_id", |r| {
|
||||||
|
api_handler(r, tenant_status)
|
||||||
|
})
|
||||||
.delete("/v1/tenant/:tenant_shard_id", |r| {
|
.delete("/v1/tenant/:tenant_shard_id", |r| {
|
||||||
api_handler(r, tenant_delete_handler)
|
api_handler(r, tenant_delete_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
|
.get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
|
||||||
api_handler(r, tenant_size_handler)
|
api_handler(r, tenant_size_handler)
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/config", |r| {
|
.put("/v1/tenant/config", |r| {
|
||||||
api_handler(r, update_tenant_config_handler)
|
api_handler(r, update_tenant_config_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
.get("/v1/tenant/:tenant_shard_id/config", |r| {
|
||||||
api_handler(r, get_tenant_config_handler)
|
api_handler(r, get_tenant_config_handler)
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
||||||
api_handler(r, put_tenant_location_config_handler)
|
api_handler(r, put_tenant_location_config_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/timeline", |r| {
|
.get("/v1/tenant/:tenant_shard_id/timeline", |r| {
|
||||||
api_handler(r, timeline_list_handler)
|
api_handler(r, timeline_list_handler)
|
||||||
})
|
})
|
||||||
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
|
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
|
||||||
@@ -1795,53 +1851,59 @@ pub fn make_router(
|
|||||||
.post("/v1/tenant/:tenant_id/detach", |r| {
|
.post("/v1/tenant/:tenant_id/detach", |r| {
|
||||||
api_handler(r, tenant_detach_handler)
|
api_handler(r, tenant_detach_handler)
|
||||||
})
|
})
|
||||||
|
.post("/v1/tenant/:tenant_shard_id/reset", |r| {
|
||||||
|
api_handler(r, tenant_reset_handler)
|
||||||
|
})
|
||||||
.post("/v1/tenant/:tenant_id/load", |r| {
|
.post("/v1/tenant/:tenant_id/load", |r| {
|
||||||
api_handler(r, tenant_load_handler)
|
api_handler(r, tenant_load_handler)
|
||||||
})
|
})
|
||||||
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
||||||
api_handler(r, tenant_ignore_handler)
|
api_handler(r, tenant_ignore_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||||
api_handler(r, timeline_detail_handler)
|
api_handler(r, timeline_detail_handler)
|
||||||
})
|
})
|
||||||
.get(
|
.get(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp",
|
||||||
|r| api_handler(r, get_lsn_by_timestamp_handler),
|
|r| api_handler(r, get_lsn_by_timestamp_handler),
|
||||||
)
|
)
|
||||||
.get(
|
.get(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
||||||
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
||||||
)
|
)
|
||||||
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
|
|
||||||
api_handler(r, timeline_gc_handler)
|
|
||||||
})
|
|
||||||
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
|
|
||||||
testing_api_handler("run timeline compaction", r, timeline_compact_handler)
|
|
||||||
})
|
|
||||||
.put(
|
.put(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
|
||||||
|
|r| api_handler(r, timeline_gc_handler),
|
||||||
|
)
|
||||||
|
.put(
|
||||||
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
||||||
|
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
|
||||||
|
)
|
||||||
|
.put(
|
||||||
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
|
||||||
|r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
|
|r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
|
||||||
)
|
)
|
||||||
.post(
|
.post(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
|
||||||
|r| api_handler(r, timeline_download_remote_layers_handler_post),
|
|r| api_handler(r, timeline_download_remote_layers_handler_post),
|
||||||
)
|
)
|
||||||
.get(
|
.get(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
|
||||||
|r| api_handler(r, timeline_download_remote_layers_handler_get),
|
|r| api_handler(r, timeline_download_remote_layers_handler_get),
|
||||||
)
|
)
|
||||||
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||||
api_handler(r, timeline_delete_handler)
|
api_handler(r, timeline_delete_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
|
|
||||||
api_handler(r, layer_map_info_handler)
|
|
||||||
})
|
|
||||||
.get(
|
.get(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer",
|
||||||
|
|r| api_handler(r, layer_map_info_handler),
|
||||||
|
)
|
||||||
|
.get(
|
||||||
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||||
|r| api_handler(r, layer_download_handler),
|
|r| api_handler(r, layer_download_handler),
|
||||||
)
|
)
|
||||||
.delete(
|
.delete(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||||
|r| api_handler(r, evict_timeline_layer_handler),
|
|r| api_handler(r, evict_timeline_layer_handler),
|
||||||
)
|
)
|
||||||
.put("/v1/disk_usage_eviction/run", |r| {
|
.put("/v1/disk_usage_eviction/run", |r| {
|
||||||
@@ -1850,18 +1912,19 @@ pub fn make_router(
|
|||||||
.put("/v1/deletion_queue/flush", |r| {
|
.put("/v1/deletion_queue/flush", |r| {
|
||||||
api_handler(r, deletion_queue_flush)
|
api_handler(r, deletion_queue_flush)
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/:tenant_id/break", |r| {
|
.put("/v1/tenant/:tenant_shard_id/break", |r| {
|
||||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||||
})
|
})
|
||||||
.get("/v1/panic", |r| api_handler(r, always_panic_handler))
|
.get("/v1/panic", |r| api_handler(r, always_panic_handler))
|
||||||
.post("/v1/tracing/event", |r| {
|
.post("/v1/tracing/event", |r| {
|
||||||
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
|
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
|
|
||||||
testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
|
|
||||||
})
|
|
||||||
.get(
|
.get(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
|
||||||
|
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
|
||||||
|
)
|
||||||
|
.get(
|
||||||
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
|
||||||
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
|
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
|
||||||
)
|
)
|
||||||
.any(handler_404))
|
.any(handler_404))
|
||||||
|
|||||||
@@ -2,19 +2,27 @@
|
|||||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||||
//! a neon Timeline.
|
//! a neon Timeline.
|
||||||
//!
|
//!
|
||||||
|
use std::io::SeekFrom;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use anyhow::{bail, ensure, Context, Result};
|
use anyhow::{bail, ensure, Context, Result};
|
||||||
|
use async_compression::tokio::bufread::ZstdDecoder;
|
||||||
|
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
use tokio::io::{AsyncRead, AsyncReadExt};
|
use nix::NixPath;
|
||||||
|
use tokio::fs::{File, OpenOptions};
|
||||||
|
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
|
||||||
use tokio_tar::Archive;
|
use tokio_tar::Archive;
|
||||||
|
use tokio_tar::Builder;
|
||||||
|
use tokio_tar::HeaderMode;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use walkdir::WalkDir;
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::pgdatadir_mapping::*;
|
use crate::pgdatadir_mapping::*;
|
||||||
|
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||||
use crate::tenant::Timeline;
|
use crate::tenant::Timeline;
|
||||||
use crate::walingest::WalIngest;
|
use crate::walingest::WalIngest;
|
||||||
use crate::walrecord::DecodedWALRecord;
|
use crate::walrecord::DecodedWALRecord;
|
||||||
@@ -33,7 +41,9 @@ use utils::lsn::Lsn;
|
|||||||
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
|
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
|
||||||
// Read control file to extract the LSN
|
// Read control file to extract the LSN
|
||||||
let controlfile_path = path.join("global").join("pg_control");
|
let controlfile_path = path.join("global").join("pg_control");
|
||||||
let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?;
|
let controlfile_buf = std::fs::read(&controlfile_path)
|
||||||
|
.with_context(|| format!("reading controlfile: {controlfile_path}"))?;
|
||||||
|
let controlfile = ControlFileData::decode(&controlfile_buf)?;
|
||||||
let lsn = controlfile.checkPoint;
|
let lsn = controlfile.checkPoint;
|
||||||
|
|
||||||
Ok(Lsn(lsn))
|
Ok(Lsn(lsn))
|
||||||
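A small illustration of why the change above wraps the read in `with_context`: a bare `?` surfaces only the underlying I/O error, while the added context names the file that could not be read. The path here is hypothetical.

// Sketch only: the difference the added context makes.
use anyhow::{Context, Result};
use std::path::Path;

fn read_controlfile(path: &Path) -> Result<Vec<u8>> {
    std::fs::read(path).with_context(|| format!("reading controlfile: {}", path.display()))
}

fn main() {
    let err = read_controlfile(Path::new("/nonexistent/global/pg_control")).unwrap_err();
    // Alternate Display prints the added context followed by the io::Error cause.
    println!("{err:#}");
}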
@@ -618,3 +628,65 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
|
|||||||
reader.read_to_end(&mut buf).await?;
|
reader.read_to_end(&mut buf).await?;
|
||||||
Ok(Bytes::from(buf))
|
Ok(Bytes::from(buf))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
|
||||||
|
let file = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.truncate(true)
|
||||||
|
.read(true)
|
||||||
|
.write(true)
|
||||||
|
.open(&tmp_path)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("tempfile creation {tmp_path}"))?;
|
||||||
|
|
||||||
|
let mut paths = Vec::new();
|
||||||
|
for entry in WalkDir::new(pgdata_path) {
|
||||||
|
let entry = entry?;
|
||||||
|
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||||
|
// Also allow directories so that we also get empty directories
|
||||||
|
if !(metadata.is_file() || metadata.is_dir()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let path = entry.into_path();
|
||||||
|
paths.push(path);
|
||||||
|
}
|
||||||
|
// Do a sort to get a more consistent listing
|
||||||
|
paths.sort_unstable();
|
||||||
|
let zstd = ZstdEncoder::with_quality_and_params(
|
||||||
|
file,
|
||||||
|
Level::Default,
|
||||||
|
&[CParameter::enable_long_distance_matching(true)],
|
||||||
|
);
|
||||||
|
let mut builder = Builder::new(zstd);
|
||||||
|
// Use reproducible header mode
|
||||||
|
builder.mode(HeaderMode::Deterministic);
|
||||||
|
for path in paths {
|
||||||
|
let rel_path = path.strip_prefix(pgdata_path)?;
|
||||||
|
if rel_path.is_empty() {
|
||||||
|
// The top directory should not be compressed,
|
||||||
|
// the tar crate doesn't like that
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
builder.append_path_with_name(&path, rel_path).await?;
|
||||||
|
}
|
||||||
|
let mut zstd = builder.into_inner().await?;
|
||||||
|
zstd.shutdown().await?;
|
||||||
|
let mut compressed = zstd.into_inner();
|
||||||
|
let compressed_len = compressed.metadata().await?.len();
|
||||||
|
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
|
||||||
|
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
|
||||||
|
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
|
||||||
|
}
|
||||||
|
compressed.seek(SeekFrom::Start(0)).await?;
|
||||||
|
Ok((compressed, compressed_len))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn extract_tar_zst(
|
||||||
|
pgdata_path: &Utf8Path,
|
||||||
|
tar_zst: impl AsyncBufRead + Unpin,
|
||||||
|
) -> Result<()> {
|
||||||
|
let tar = Box::pin(ZstdDecoder::new(tar_zst));
|
||||||
|
let mut archive = Archive::new(tar);
|
||||||
|
archive.unpack(pgdata_path).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
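A hedged usage sketch of the two helpers added above: build the compressed archive, then feed the returned (already rewound) file back through the extractor via a buffered reader. The module path and directory names are assumptions.

// Sketch only: round-tripping a data directory through the helpers above.
// create_tar_zst returns the temp file already seeked back to the start, so a
// tokio BufReader over it satisfies extract_tar_zst's AsyncBufRead bound.
use anyhow::Result;
use camino::Utf8Path;
use tokio::io::BufReader;

// Assumed to be the module defined above; the real path inside the
// pageserver crate may differ.
use crate::import_datadir::{create_tar_zst, extract_tar_zst};

async fn roundtrip(pgdata: &Utf8Path, scratch: &Utf8Path) -> Result<()> {
    let tmp_tar = scratch.join("initdb.tar.zst");
    let (archive, compressed_len) = create_tar_zst(pgdata, &tmp_tar).await?;
    println!("compressed pgdata to {compressed_len} bytes");

    let restore_dir = scratch.join("restored");
    tokio::fs::create_dir_all(&restore_dir).await?;
    extract_tar_zst(&restore_dir, BufReader::new(archive)).await?;
    Ok(())
}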
|
|||||||
@@ -49,11 +49,22 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
|||||||
|
|
||||||
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||||
|
|
||||||
|
/// The main cancellation token for the process.
|
||||||
|
///
|
||||||
|
/// Should only ever be used to create child tokens.
|
||||||
|
pub static PAGESERVER_SHUTDOWN_TOKEN: std::sync::OnceLock<tokio_util::sync::CancellationToken> =
|
||||||
|
std::sync::OnceLock::new();
|
||||||
|
|
||||||
pub use crate::metrics::preinitialize_metrics;
|
pub use crate::metrics::preinitialize_metrics;
|
||||||
|
|
||||||
#[tracing::instrument(skip_all, fields(%exit_code))]
|
#[tracing::instrument(skip_all, fields(%exit_code))]
|
||||||
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
|
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
|
if let Some(token) = PAGESERVER_SHUTDOWN_TOKEN.get() {
|
||||||
|
token.cancel();
|
||||||
|
}
|
||||||
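A minimal sketch of the intended lifecycle of the shutdown token introduced above, with a local static standing in for `PAGESERVER_SHUTDOWN_TOKEN`: install the root token at startup, hand out only child tokens, and cancel the root during shutdown so every child observes it.

// Sketch only: root token installed once, subsystems hold child tokens,
// cancelling the root propagates to all children.
use std::sync::OnceLock;
use tokio_util::sync::CancellationToken;

static SHUTDOWN_TOKEN: OnceLock<CancellationToken> = OnceLock::new();

#[tokio::main]
async fn main() {
    // Startup: install the root token exactly once.
    let _ = SHUTDOWN_TOKEN.set(CancellationToken::new());

    // A subsystem takes a child token instead of the root itself.
    let child = SHUTDOWN_TOKEN.get().unwrap().child_token();
    let worker = tokio::spawn(async move {
        child.cancelled().await;
        "stopped cleanly"
    });

    // Shutdown: cancel the root, which cancels every child.
    if let Some(token) = SHUTDOWN_TOKEN.get() {
        token.cancel();
    }
    assert_eq!(worker.await.unwrap(), "stopped cleanly");
}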
|
|
||||||
// Shut down the libpq endpoint task. This prevents new connections from
|
// Shut down the libpq endpoint task. This prevents new connections from
|
||||||
// being accepted.
|
// being accepted.
|
||||||
timed(
|
timed(
|
||||||
@@ -186,13 +197,6 @@ pub struct InitializationOrder {
|
|||||||
/// Each initial tenant load task carries this until completion.
|
/// Each initial tenant load task carries this until completion.
|
||||||
pub initial_tenant_load: Option<utils::completion::Completion>,
|
pub initial_tenant_load: Option<utils::completion::Completion>,
|
||||||
|
|
||||||
/// Barrier for when we can start initial logical size calculations.
|
|
||||||
pub initial_logical_size_can_start: utils::completion::Barrier,
|
|
||||||
|
|
||||||
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
|
|
||||||
/// attempt. It is important to drop this once the attempt has completed.
|
|
||||||
pub initial_logical_size_attempt: Option<utils::completion::Completion>,
|
|
||||||
|
|
||||||
/// Barrier for when we can start any background jobs.
|
/// Barrier for when we can start any background jobs.
|
||||||
///
|
///
|
||||||
/// This can be broken up later on, but right now there is just one class of a background job.
|
/// This can be broken up later on, but right now there is just one class of a background job.
|
||||||
@@ -212,7 +216,7 @@ async fn timed<Fut: std::future::Future>(
|
|||||||
match tokio::time::timeout(warn_at, &mut fut).await {
|
match tokio::time::timeout(warn_at, &mut fut).await {
|
||||||
Ok(ret) => {
|
Ok(ret) => {
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
task = name,
|
stage = name,
|
||||||
elapsed_ms = started.elapsed().as_millis(),
|
elapsed_ms = started.elapsed().as_millis(),
|
||||||
"completed"
|
"completed"
|
||||||
);
|
);
|
||||||
@@ -220,7 +224,7 @@ async fn timed<Fut: std::future::Future>(
|
|||||||
}
|
}
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
task = name,
|
stage = name,
|
||||||
elapsed_ms = started.elapsed().as_millis(),
|
elapsed_ms = started.elapsed().as_millis(),
|
||||||
"still waiting, taking longer than expected..."
|
"still waiting, taking longer than expected..."
|
||||||
);
|
);
|
||||||
@@ -229,7 +233,7 @@ async fn timed<Fut: std::future::Future>(
|
|||||||
|
|
||||||
// this has a global allowed_errors
|
// this has a global allowed_errors
|
||||||
tracing::warn!(
|
tracing::warn!(
|
||||||
task = name,
|
stage = name,
|
||||||
elapsed_ms = started.elapsed().as_millis(),
|
elapsed_ms = started.elapsed().as_millis(),
|
||||||
"completed, took longer than expected"
|
"completed, took longer than expected"
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ use metrics::{
|
|||||||
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||||
};
|
};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
|
use pageserver_api::shard::TenantShardId;
|
||||||
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
||||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
@@ -284,6 +285,63 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
|
|||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) mod page_cache_eviction_metrics {
|
||||||
|
use std::num::NonZeroUsize;
|
||||||
|
|
||||||
|
use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub(crate) enum Outcome {
|
||||||
|
FoundSlotUnused { iters: NonZeroUsize },
|
||||||
|
FoundSlotEvicted { iters: NonZeroUsize },
|
||||||
|
ItersExceeded { iters: NonZeroUsize },
|
||||||
|
}
|
||||||
|
|
||||||
|
static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
|
register_int_counter_vec!(
|
||||||
|
"pageserver_page_cache_find_victim_iters_total",
|
||||||
|
"Counter for the number of iterations in the find_victim loop",
|
||||||
|
&["outcome"],
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric")
|
||||||
|
});
|
||||||
|
|
||||||
|
static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
|
register_int_counter_vec!(
|
||||||
|
"pageserver_page_cache_find_victim_calls",
|
||||||
|
"Incremented at the end of each find_victim() call.\
|
||||||
|
Filter by outcome to get e.g., eviction rate.",
|
||||||
|
&["outcome"]
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
pub(crate) fn observe(outcome: Outcome) {
|
||||||
|
macro_rules! dry {
|
||||||
|
($label:literal, $iters:expr) => {{
|
||||||
|
static LABEL: &'static str = $label;
|
||||||
|
static ITERS_TOTAL: Lazy<IntCounter> =
|
||||||
|
Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
|
||||||
|
static CALLS: Lazy<IntCounter> =
|
||||||
|
Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
|
||||||
|
ITERS_TOTAL.inc_by(($iters.get()) as u64);
|
||||||
|
CALLS.inc();
|
||||||
|
}};
|
||||||
|
}
|
||||||
|
match outcome {
|
||||||
|
Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
|
||||||
|
Outcome::FoundSlotEvicted { iters } => {
|
||||||
|
dry!("found_evicted", iters)
|
||||||
|
}
|
||||||
|
Outcome::ItersExceeded { iters } => {
|
||||||
|
dry!("err_iters_exceeded", iters);
|
||||||
|
super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
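The `dry!` macro above exists so that each outcome label gets a lazily initialised child counter instead of paying a `with_label_values` lookup on every observation. A reduced sketch of that caching pattern, assuming the same workspace `metrics` wrapper crate and `once_cell` used by the surrounding code:

// Sketch only: resolve the labelled child once, then just increment it.
use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;

static OUTCOMES_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "example_find_victim_outcomes_total",
        "Example counter for illustration only",
        &["outcome"]
    )
    .expect("failed to define a metric")
});

fn record_found_empty(iters: u64) {
    // The child counter for this label is looked up only on first use.
    static FOUND_EMPTY: Lazy<IntCounter> =
        Lazy::new(|| OUTCOMES_VEC.with_label_values(&["found_empty"]));
    FOUND_EMPTY.inc_by(iters);
}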
|
|
||||||
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||||
register_histogram!(
|
register_histogram!(
|
||||||
"pageserver_page_cache_acquire_pinned_slot_seconds",
|
"pageserver_page_cache_acquire_pinned_slot_seconds",
|
||||||
@@ -293,14 +351,6 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::n
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
|
|
||||||
register_int_counter!(
|
|
||||||
"pageserver_page_cache_find_victim_iters_total",
|
|
||||||
"Counter for the number of iterations in the find_victim loop",
|
|
||||||
)
|
|
||||||
.expect("failed to define a metric")
|
|
||||||
});
|
|
||||||
|
|
||||||
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
|
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
register_int_counter_vec!(
|
register_int_counter_vec!(
|
||||||
"page_cache_errors_total",
|
"page_cache_errors_total",
|
||||||
@@ -402,6 +452,129 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
|||||||
.expect("failed to define current logical size metric")
|
.expect("failed to define current logical size metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) mod initial_logical_size {
|
||||||
|
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
pub(crate) struct StartCalculation(IntCounterVec);
|
||||||
|
pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
|
||||||
|
StartCalculation(
|
||||||
|
register_int_counter_vec!(
|
||||||
|
"pageserver_initial_logical_size_start_calculation",
|
||||||
|
"Incremented each time we start an initial logical size calculation attempt. \
|
||||||
|
The `circumstances` label provides some additional details.",
|
||||||
|
&["attempt", "circumstances"]
|
||||||
|
)
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
});
|
||||||
|
|
||||||
|
struct DropCalculation {
|
||||||
|
first: IntCounter,
|
||||||
|
retry: IntCounter,
|
||||||
|
}
|
||||||
|
|
||||||
|
static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
|
||||||
|
let vec = register_int_counter_vec!(
|
||||||
|
"pageserver_initial_logical_size_drop_calculation",
|
||||||
|
"Incremented each time we abort a started size calculation attmpt.",
|
||||||
|
&["attempt"]
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
DropCalculation {
|
||||||
|
first: vec.with_label_values(&["first"]),
|
||||||
|
retry: vec.with_label_values(&["retry"]),
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
pub(crate) struct Calculated {
|
||||||
|
pub(crate) births: IntCounter,
|
||||||
|
pub(crate) deaths: IntCounter,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
|
||||||
|
births: register_int_counter!(
|
||||||
|
"pageserver_initial_logical_size_finish_calculation",
|
||||||
|
"Incremented every time we finish calculation of initial logical size. \
|
||||||
|
If everything is working well, this should happen at most once per Timeline object."
|
||||||
|
)
|
||||||
|
.unwrap(),
|
||||||
|
deaths: register_int_counter!(
|
||||||
|
"pageserver_initial_logical_size_drop_finished_calculation",
|
||||||
|
"Incremented when we drop a finished initial logical size calculation result. \
|
||||||
|
Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
|
||||||
|
)
|
||||||
|
.unwrap(),
|
||||||
|
});
|
||||||
|
|
||||||
|
pub(crate) struct OngoingCalculationGuard {
|
||||||
|
inc_drop_calculation: Option<IntCounter>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(strum_macros::IntoStaticStr)]
|
||||||
|
pub(crate) enum StartCircumstances {
|
||||||
|
EmptyInitial,
|
||||||
|
SkippedConcurrencyLimiter,
|
||||||
|
AfterBackgroundTasksRateLimit,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StartCalculation {
|
||||||
|
pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||||
|
let circumstances_label: &'static str = circumstances.into();
|
||||||
|
self.0.with_label_values(&["first", circumstances_label]);
|
||||||
|
OngoingCalculationGuard {
|
||||||
|
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||||
|
let circumstances_label: &'static str = circumstances.into();
|
||||||
|
self.0.with_label_values(&["retry", circumstances_label]);
|
||||||
|
OngoingCalculationGuard {
|
||||||
|
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for OngoingCalculationGuard {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if let Some(counter) = self.inc_drop_calculation.take() {
|
||||||
|
counter.inc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl OngoingCalculationGuard {
|
||||||
|
pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
|
||||||
|
drop(self.inc_drop_calculation.take());
|
||||||
|
CALCULATED.births.inc();
|
||||||
|
FinishedCalculationGuard {
|
||||||
|
inc_on_drop: CALCULATED.deaths.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) struct FinishedCalculationGuard {
|
||||||
|
inc_on_drop: IntCounter,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for FinishedCalculationGuard {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
self.inc_on_drop.inc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// context: https://github.com/neondatabase/neon/issues/5963
|
||||||
|
pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
|
||||||
|
Lazy::new(|| {
|
||||||
|
register_int_counter!(
|
||||||
|
"pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
|
||||||
|
"Counter for the following event: walreceiver calls \
|
||||||
|
Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||||
register_uint_gauge_vec!(
|
register_uint_gauge_vec!(
|
||||||
"pageserver_tenant_states_count",
|
"pageserver_tenant_states_count",
|
||||||
@@ -477,7 +650,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
|
|||||||
"pageserver_evictions_with_low_residence_duration",
|
"pageserver_evictions_with_low_residence_duration",
|
||||||
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
|
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
|
||||||
Residence duration is determined using the `residence_duration_data_source`.",
|
Residence duration is determined using the `residence_duration_data_source`.",
|
||||||
&["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
|
&["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
|
||||||
)
|
)
|
||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
@@ -541,10 +714,16 @@ impl EvictionsWithLowResidenceDurationBuilder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
|
fn build(
|
||||||
|
&self,
|
||||||
|
tenant_id: &str,
|
||||||
|
shard_id: &str,
|
||||||
|
timeline_id: &str,
|
||||||
|
) -> EvictionsWithLowResidenceDuration {
|
||||||
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
|
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
|
||||||
.get_metric_with_label_values(&[
|
.get_metric_with_label_values(&[
|
||||||
tenant_id,
|
tenant_id,
|
||||||
|
shard_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
self.data_source,
|
self.data_source,
|
||||||
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
|
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
|
||||||
@@ -575,21 +754,24 @@ impl EvictionsWithLowResidenceDuration {
|
|||||||
pub fn change_threshold(
|
pub fn change_threshold(
|
||||||
&mut self,
|
&mut self,
|
||||||
tenant_id: &str,
|
tenant_id: &str,
|
||||||
|
shard_id: &str,
|
||||||
timeline_id: &str,
|
timeline_id: &str,
|
||||||
new_threshold: Duration,
|
new_threshold: Duration,
|
||||||
) {
|
) {
|
||||||
if new_threshold == self.threshold {
|
if new_threshold == self.threshold {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
let mut with_new =
|
let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
|
||||||
EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
|
self.data_source,
|
||||||
.build(tenant_id, timeline_id);
|
new_threshold,
|
||||||
|
)
|
||||||
|
.build(tenant_id, shard_id, timeline_id);
|
||||||
std::mem::swap(self, &mut with_new);
|
std::mem::swap(self, &mut with_new);
|
||||||
with_new.remove(tenant_id, timeline_id);
|
with_new.remove(tenant_id, shard_id, timeline_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
// This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
|
// This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
|
||||||
fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
|
fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
|
||||||
let Some(_counter) = self.counter.take() else {
|
let Some(_counter) = self.counter.take() else {
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
@@ -598,6 +780,7 @@ impl EvictionsWithLowResidenceDuration {
|
|||||||
|
|
||||||
let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
|
let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
|
||||||
tenant_id,
|
tenant_id,
|
||||||
|
shard_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
self.data_source,
|
self.data_source,
|
||||||
&threshold,
|
&threshold,
|
||||||
@@ -638,7 +821,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
|||||||
///
|
///
|
||||||
/// Operations:
|
/// Operations:
|
||||||
/// - open ([`std::fs::OpenOptions::open`])
|
/// - open ([`std::fs::OpenOptions::open`])
|
||||||
/// - close (dropping [`std::fs::File`])
|
/// - close (dropping [`crate::virtual_file::VirtualFile`])
|
||||||
/// - close-by-replace (close by replacement algorithm)
|
/// - close-by-replace (close by replacement algorithm)
|
||||||
/// - read (`read_at`)
|
/// - read (`read_at`)
|
||||||
/// - write (`write_at`)
|
/// - write (`write_at`)
|
||||||
@@ -650,6 +833,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
|||||||
)]
|
)]
|
||||||
pub(crate) enum StorageIoOperation {
|
pub(crate) enum StorageIoOperation {
|
||||||
Open,
|
Open,
|
||||||
|
OpenAfterReplace,
|
||||||
Close,
|
Close,
|
||||||
CloseByReplace,
|
CloseByReplace,
|
||||||
Read,
|
Read,
|
||||||
@@ -663,6 +847,7 @@ impl StorageIoOperation {
|
|||||||
pub fn as_str(&self) -> &'static str {
|
pub fn as_str(&self) -> &'static str {
|
||||||
match self {
|
match self {
|
||||||
StorageIoOperation::Open => "open",
|
StorageIoOperation::Open => "open",
|
||||||
|
StorageIoOperation::OpenAfterReplace => "open-after-replace",
|
||||||
StorageIoOperation::Close => "close",
|
StorageIoOperation::Close => "close",
|
||||||
StorageIoOperation::CloseByReplace => "close-by-replace",
|
StorageIoOperation::CloseByReplace => "close-by-replace",
|
||||||
StorageIoOperation::Read => "read",
|
StorageIoOperation::Read => "read",
|
||||||
@@ -717,6 +902,25 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) mod virtual_file_descriptor_cache {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
|
||||||
|
register_uint_gauge!(
|
||||||
|
"pageserver_virtual_file_descriptor_cache_size_max",
|
||||||
|
"Maximum number of open file descriptors in the cache."
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
// SIZE_CURRENT: derive it like so:
|
||||||
|
// ```
|
||||||
|
// sum (pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$")
|
||||||
|
// -ignoring(operation)
|
||||||
|
// sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"}
|
||||||
|
// ```
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
struct GlobalAndPerTimelineHistogram {
|
struct GlobalAndPerTimelineHistogram {
|
||||||
global: Histogram,
|
global: Histogram,
|
||||||
@@ -1043,6 +1247,30 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) struct WalIngestMetrics {
|
||||||
|
pub(crate) records_received: IntCounter,
|
||||||
|
pub(crate) records_committed: IntCounter,
|
||||||
|
pub(crate) records_filtered: IntCounter,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||||
|
records_received: register_int_counter!(
|
||||||
|
"pageserver_wal_ingest_records_received",
|
||||||
|
"Number of WAL records received from safekeepers"
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
records_committed: register_int_counter!(
|
||||||
|
"pageserver_wal_ingest_records_committed",
|
||||||
|
"Number of WAL records which resulted in writes to pageserver storage"
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
records_filtered: register_int_counter!(
|
||||||
|
"pageserver_wal_ingest_records_filtered",
|
||||||
|
"Number of WAL records filtered out due to sharding"
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
});
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
pub enum RemoteOpKind {
|
pub enum RemoteOpKind {
|
||||||
Upload,
|
Upload,
|
||||||
@@ -1252,9 +1480,20 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||||
|
register_histogram!(
|
||||||
|
"pageserver_wal_redo_process_launch_duration",
|
||||||
|
"Histogram of the duration of successful WalRedoProcess::launch calls",
|
||||||
|
redo_histogram_time_buckets!(),
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric")
|
||||||
|
});
|
||||||
|
|
||||||
pub(crate) struct WalRedoProcessCounters {
|
pub(crate) struct WalRedoProcessCounters {
|
||||||
pub(crate) started: IntCounter,
|
pub(crate) started: IntCounter,
|
||||||
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
|
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
|
||||||
|
pub(crate) active_stderr_logger_tasks_started: IntCounter,
|
||||||
|
pub(crate) active_stderr_logger_tasks_finished: IntCounter,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
||||||
@@ -1278,6 +1517,19 @@ impl Default for WalRedoProcessCounters {
|
|||||||
&["cause"],
|
&["cause"],
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
let active_stderr_logger_tasks_started = register_int_counter!(
|
||||||
|
"pageserver_walredo_stderr_logger_tasks_started_total",
|
||||||
|
"Number of active walredo stderr logger tasks that have started",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let active_stderr_logger_tasks_finished = register_int_counter!(
|
||||||
|
"pageserver_walredo_stderr_logger_tasks_finished_total",
|
||||||
|
"Number of active walredo stderr logger tasks that have finished",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
started,
|
started,
|
||||||
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
|
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
|
||||||
@@ -1285,6 +1537,8 @@ impl Default for WalRedoProcessCounters {
|
|||||||
let cause_str: &'static str = cause.into();
|
let cause_str: &'static str = cause.into();
|
||||||
killed.with_label_values(&[cause_str])
|
killed.with_label_values(&[cause_str])
|
||||||
})),
|
})),
|
||||||
|
active_stderr_logger_tasks_started,
|
||||||
|
active_stderr_logger_tasks_finished,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1359,6 +1613,7 @@ impl StorageTimeMetrics {
|
|||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct TimelineMetrics {
|
pub struct TimelineMetrics {
|
||||||
tenant_id: String,
|
tenant_id: String,
|
||||||
|
shard_id: String,
|
||||||
timeline_id: String,
|
timeline_id: String,
|
||||||
pub flush_time_histo: StorageTimeMetrics,
|
pub flush_time_histo: StorageTimeMetrics,
|
||||||
pub compact_time_histo: StorageTimeMetrics,
|
pub compact_time_histo: StorageTimeMetrics,
|
||||||
@@ -1379,11 +1634,12 @@ pub struct TimelineMetrics {
|
|||||||
|
|
||||||
impl TimelineMetrics {
|
impl TimelineMetrics {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
tenant_id: &TenantId,
|
tenant_shard_id: &TenantShardId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let tenant_id = tenant_id.to_string();
|
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||||
|
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||||
let timeline_id = timeline_id.to_string();
|
let timeline_id = timeline_id.to_string();
|
||||||
let flush_time_histo =
|
let flush_time_histo =
|
||||||
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
|
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
|
||||||
@@ -1420,11 +1676,12 @@ impl TimelineMetrics {
|
|||||||
let evictions = EVICTIONS
|
let evictions = EVICTIONS
|
||||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let evictions_with_low_residence_duration =
|
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
|
||||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
.build(&tenant_id, &shard_id, &timeline_id);
|
||||||
|
|
||||||
TimelineMetrics {
|
TimelineMetrics {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
|
shard_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
flush_time_histo,
|
flush_time_histo,
|
||||||
compact_time_histo,
|
compact_time_histo,
|
||||||
@@ -1470,6 +1727,7 @@ impl Drop for TimelineMetrics {
|
|||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
let tenant_id = &self.tenant_id;
|
let tenant_id = &self.tenant_id;
|
||||||
let timeline_id = &self.timeline_id;
|
let timeline_id = &self.timeline_id;
|
||||||
|
let shard_id = &self.shard_id;
|
||||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
||||||
{
|
{
|
||||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||||
@@ -1483,7 +1741,7 @@ impl Drop for TimelineMetrics {
|
|||||||
self.evictions_with_low_residence_duration
|
self.evictions_with_low_residence_duration
|
||||||
.write()
|
.write()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.remove(tenant_id, timeline_id);
|
.remove(tenant_id, shard_id, timeline_id);
|
||||||
|
|
||||||
// The following metrics are born outside of the TimelineMetrics lifecycle but still
|
// The following metrics are born outside of the TimelineMetrics lifecycle but still
|
||||||
// removed at the end of it. The idea is to have the metrics outlive the
|
// removed at the end of it. The idea is to have the metrics outlive the
|
||||||
@@ -1571,9 +1829,9 @@ pub struct RemoteTimelineClientMetrics {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl RemoteTimelineClientMetrics {
|
impl RemoteTimelineClientMetrics {
|
||||||
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||||
RemoteTimelineClientMetrics {
|
RemoteTimelineClientMetrics {
|
||||||
tenant_id: tenant_id.to_string(),
|
tenant_id: tenant_shard_id.tenant_id.to_string(),
|
||||||
timeline_id: timeline_id.to_string(),
|
timeline_id: timeline_id.to_string(),
|
||||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||||
@@ -1944,6 +2202,8 @@ pub fn preinitialize_metrics() {
|
|||||||
// Tenant manager stats
|
// Tenant manager stats
|
||||||
Lazy::force(&TENANT_MANAGER);
|
Lazy::force(&TENANT_MANAGER);
|
||||||
|
|
||||||
|
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
|
||||||
|
|
||||||
// countervecs
|
// countervecs
|
||||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@@ -1961,6 +2221,7 @@ pub fn preinitialize_metrics() {
|
|||||||
&WAL_REDO_TIME,
|
&WAL_REDO_TIME,
|
||||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||||
&WAL_REDO_BYTES_HISTOGRAM,
|
&WAL_REDO_BYTES_HISTOGRAM,
|
||||||
|
&WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
|
||||||
]
|
]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.for_each(|h| {
|
.for_each(|h| {
|
||||||
|
|||||||
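The recurring change in the hunks above is threading a new `shard_id` label (rendered via `TenantShardId::shard_slug()`) through the per-timeline metric families and removing the label set again when the timeline's metrics are dropped. Below is a minimal, self-contained sketch of that label-vector pattern using the `prometheus` and `once_cell` crates; the metric name, help text, and label values are illustrative stand-ins, not the pageserver's real ones.

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

// A counter vector keyed by tenant, shard and timeline, mirroring the
// ["tenant_id", "shard_id", "timeline_id"] label sets introduced above.
static EVICTIONS_DEMO: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "demo_evictions_total",
        "Number of layer evictions (illustrative metric).",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

fn record_eviction(tenant_id: &str, shard_id: &str, timeline_id: &str) {
    // Label values must be passed in the same order as they were declared.
    EVICTIONS_DEMO
        .with_label_values(&[tenant_id, shard_id, timeline_id])
        .inc();
}

fn main() {
    record_eviction("tenant-a", "0001", "timeline-b");

    // When a timeline goes away, its label combination should be removed so
    // stale series do not linger, as the Drop impl for TimelineMetrics does.
    let _ = EVICTIONS_DEMO.remove_label_values(&["tenant-a", "0001", "timeline-b"]);
}
```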
@@ -28,7 +28,7 @@
 //! Page cache maps from a cache key to a buffer slot.
 //! The cache key uniquely identifies the piece of data that is being cached.
 //!
-//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
+//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
 //! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
 //!
 //! The cache key for **immutable file** pages is [`FileId`] and a block number.
@@ -83,12 +83,14 @@ use std::{
 
 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use pageserver_api::shard::TenantShardId;
+use utils::{id::TimelineId, lsn::Lsn};
 
-use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
+use crate::{
+    context::RequestContext,
+    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
+    repository::Key,
+};
 
 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -150,7 +152,13 @@ enum CacheKey {
 
 #[derive(Debug, PartialEq, Eq, Hash, Clone)]
 struct MaterializedPageHashKey {
-    tenant_id: TenantId,
+    /// Why is this TenantShardId rather than TenantId?
+    ///
+    /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this
+    /// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this
+    /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
+    /// special-cased in some other way.
+    tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
     key: Key,
 }
@@ -374,7 +382,7 @@ impl PageCache {
     /// returned page.
     pub async fn lookup_materialized_page(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         key: &Key,
         lsn: Lsn,
@@ -391,7 +399,7 @@ impl PageCache {
 
         let mut cache_key = CacheKey::MaterializedPage {
             hash_key: MaterializedPageHashKey {
-                tenant_id,
+                tenant_shard_id,
                 timeline_id,
                 key: *key,
             },
@@ -432,7 +440,7 @@ impl PageCache {
     ///
     pub async fn memorize_materialized_page(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         key: Key,
         lsn: Lsn,
@@ -440,7 +448,7 @@ impl PageCache {
     ) -> anyhow::Result<()> {
         let cache_key = CacheKey::MaterializedPage {
             hash_key: MaterializedPageHashKey {
-                tenant_id,
+                tenant_shard_id,
                 timeline_id,
                 key,
             },
@@ -897,8 +905,10 @@ impl PageCache {
             // Note that just yielding to tokio during iteration without such
             // priority boosting is likely counter-productive. We'd just give more opportunities
             // for B to bump usage count, further starving A.
-            crate::metrics::page_cache_errors_inc(
-                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+            page_cache_eviction_metrics::observe(
+                page_cache_eviction_metrics::Outcome::ItersExceeded {
+                    iters: iters.try_into().unwrap(),
+                },
             );
             anyhow::bail!("exceeded evict iter limit");
         }
@@ -909,8 +919,18 @@ impl PageCache {
                 // remove mapping for old buffer
                 self.remove_mapping(old_key);
                 inner.key = None;
+                page_cache_eviction_metrics::observe(
+                    page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
+                        iters: iters.try_into().unwrap(),
+                    },
+                );
+            } else {
+                page_cache_eviction_metrics::observe(
+                    page_cache_eviction_metrics::Outcome::FoundSlotUnused {
+                        iters: iters.try_into().unwrap(),
+                    },
+                );
             }
-            crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
             return Ok((slot_idx, inner));
         }
     }
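The eviction path above now records a structured outcome (slot found unused, slot evicted, or iteration limit exceeded) via `page_cache_eviction_metrics::observe` instead of bumping a generic error counter. A rough sketch of how such an outcome-observing helper can be built on a labeled counter follows; the enum, metric name, and label values here are assumptions for illustration and are not the actual `page_cache_eviction_metrics` module.

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

/// Outcome of one find-victim pass, mirroring the shape used above.
pub enum Outcome {
    FoundSlotUnused { iters: u32 },
    FoundSlotEvicted { iters: u32 },
    ItersExceeded { iters: u32 },
}

static FIND_VICTIM_OUTCOMES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "demo_page_cache_find_victim_outcomes_total",
        "Outcomes of find-victim passes (illustrative metric).",
        &["outcome"]
    )
    .unwrap()
});

pub fn observe(outcome: Outcome) {
    // Collapse the enum into a label value; the iteration count could also be
    // fed into a histogram if its distribution matters.
    let label = match outcome {
        Outcome::FoundSlotUnused { .. } => "found_slot_unused",
        Outcome::FoundSlotEvicted { .. } => "found_slot_evicted",
        Outcome::ItersExceeded { .. } => "iters_exceeded",
    };
    FIND_VICTIM_OUTCOMES.with_label_values(&[label]).inc();
}

fn main() {
    observe(Outcome::FoundSlotUnused { iters: 1 });
    observe(Outcome::ItersExceeded { iters: 400 });
}
```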
@@ -53,21 +53,23 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
+use crate::pgdatadir_mapping::rel_block_to_key;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
+use crate::tenant::mgr::ShardSelector;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;
 
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
 
-// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
+// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
 // is not yet in state [`TenantState::Active`].
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 
 /// Read the end of a tar archive.
 ///
@@ -164,6 +166,7 @@ pub async fn libpq_listener_main(
             None,
             "serving compute connection task",
             false,
+            cancel.child_token(),
             page_service_conn_main(
                 conf,
                 broker_client.clone(),
@@ -399,18 +402,25 @@ impl PageServerHandler {
     {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
-        // Make request tracer if needed
+        // Note that since one connection may contain getpage requests that target different
+        // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
+        // that we look up here may not be the one that serves all the actual requests: we will double
+        // check the mapping of key->shard later before calling into Timeline for getpage requests.
         let tenant = mgr::get_active_tenant_with_timeout(
             tenant_id,
+            ShardSelector::First,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
         .await?;
+
+        // Make request tracer if needed
         let mut tracer = if tenant.get_trace_read_requests() {
             let connection_id = ConnectionId::generate();
-            let path = tenant
-                .conf
-                .trace_path(&tenant_id, &timeline_id, &connection_id);
+            let path =
+                tenant
+                    .conf
+                    .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
             Some(Tracer::new(path))
         } else {
             None
@@ -562,6 +572,7 @@ impl PageServerHandler {
         info!("creating new timeline");
         let tenant = get_active_tenant_with_timeout(
             tenant_id,
+            ShardSelector::Zero,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
@@ -624,7 +635,7 @@ impl PageServerHandler {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
         let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;
         let last_record_lsn = timeline.get_last_record_lsn();
         if last_record_lsn != start_lsn {
@@ -803,9 +814,49 @@ impl PageServerHandler {
         }
         */
 
-        let page = timeline
-            .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-            .await?;
+        let key = rel_block_to_key(req.rel, req.blkno);
+        let page = if timeline.get_shard_identity().is_key_local(&key) {
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        } else {
+            // The Tenant shard we looked up at connection start does not hold this particular
+            // key: look for other shards in this tenant. This scenario occurs if a pageserver
+            // has multiple shards for the same tenant.
+            //
+            // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
+            let timeline = match self
+                .get_active_tenant_timeline(
+                    timeline.tenant_shard_id.tenant_id,
+                    timeline.timeline_id,
+                    ShardSelector::Page(key),
+                )
+                .await
+            {
+                Ok(t) => t,
+                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
+                    // We already know this tenant exists in general, because we resolved it at
+                    // start of connection. Getting a NotFound here indicates that the shard containing
+                    // the requested page is not present on this node.
+
+                    // TODO: this should be some kind of structured error that the client will understand,
+                    // so that it can block until its config is updated: this error is expected in the case
+                    // that the Tenant's shards' placements are being updated and the client hasn't been
+                    // informed yet.
+                    //
+                    // https://github.com/neondatabase/neon/issues/6038
+                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
+                }
+                Err(e) => return Err(e.into()),
+            };
+
+            // Take a GateGuard for the duration of this request. If we were using our main Timeline object,
+            // the GateGuard was already held over the whole connection.
+            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        };
 
         Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
             page,
@@ -834,7 +885,7 @@ impl PageServerHandler {
 
         // check that the timeline exists
         let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         if let Some(lsn) = lsn {
@@ -940,9 +991,11 @@ impl PageServerHandler {
         &self,
         tenant_id: TenantId,
         timeline_id: TimelineId,
+        selector: ShardSelector,
     ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
         let tenant = get_active_tenant_with_timeout(
             tenant_id,
+            selector,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
@@ -1116,7 +1169,7 @@ where
 
         self.check_permission(Some(tenant_id))?;
         let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;
 
         let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1303,6 +1356,7 @@ where
 
         let tenant = get_active_tenant_with_timeout(
             tenant_id,
+            ShardSelector::Zero,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
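The getpage hunk above first asks the connection's shard whether it owns the requested key and only re-resolves the tenant with `ShardSelector::Page(key)` when it does not. The sketch below isolates that routing decision; the `DemoShardIdentity` type and its striping arithmetic are deliberate simplifications (the real `ShardIdentity` also takes the relation identity into account), so treat this as an illustration of the control flow only.

```rust
/// Illustrative shard identity: `count` shards, pages striped in fixed-size runs.
#[derive(Clone, Copy)]
struct DemoShardIdentity {
    number: u32,
    count: u32,
    stripe_size: u32, // in pages
}

impl DemoShardIdentity {
    fn shard_of_block(&self, blkno: u32) -> u32 {
        if self.count <= 1 {
            0
        } else {
            (blkno / self.stripe_size) % self.count
        }
    }

    fn is_key_local(&self, blkno: u32) -> bool {
        self.shard_of_block(blkno) == self.number
    }
}

fn route_getpage(identity: DemoShardIdentity, blkno: u32) -> &'static str {
    if identity.is_key_local(blkno) {
        // Serve from the timeline that was resolved at connection start.
        "serve locally"
    } else {
        // Re-resolve: another shard of this tenant (possibly on this node) owns the page.
        "look up ShardSelector::Page(key)"
    }
}

fn main() {
    let shard1 = DemoShardIdentity { number: 1, count: 4, stripe_size: 32768 };
    assert_eq!(route_getpage(shard1, 40_000), "serve locally"); // 40000/32768 = 1, 1 % 4 == 1
    assert_eq!(route_getpage(shard1, 10), "look up ShardSelector::Page(key)"); // stripe 0 -> shard 0
}
```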
@@ -13,6 +13,7 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Buf, Bytes};
+use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -21,6 +22,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -281,6 +283,10 @@ impl Timeline {
     }
 
     /// Get a list of all existing relations in given tablespace and database.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     pub async fn list_rels(
         &self,
         spcnode: Oid,
@@ -365,6 +371,7 @@ impl Timeline {
     pub async fn find_lsn_for_timestamp(
         &self,
         search_timestamp: TimestampTz,
+        cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> Result<LsnForTimestamp, PageReconstructError> {
         let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
@@ -383,6 +390,9 @@ impl Timeline {
         let mut found_smaller = false;
         let mut found_larger = false;
         while low < high {
+            if cancel.is_cancelled() {
+                return Err(PageReconstructError::Cancelled);
+            }
             // cannot overflow, high and low are both smaller than u64::MAX / 2
             let mid = (high + low) / 2;
 
@@ -625,6 +635,10 @@ impl Timeline {
     ///
     /// Only relation blocks are counted currently. That excludes metadata,
     /// SLRUs, twophase files etc.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     pub async fn get_current_logical_size_non_incremental(
         &self,
         lsn: Lsn,
@@ -808,10 +822,7 @@ impl<'a> DatadirModification<'a> {
         self.put(DBDIR_KEY, Value::Image(buf.into()));
 
         // Create AuxFilesDirectory
-        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-            files: HashMap::new(),
-        })?;
-        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+        self.init_aux_dir()?;
 
         let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
             xids: HashSet::new(),
@@ -919,10 +930,7 @@ impl<'a> DatadirModification<'a> {
             self.put(DBDIR_KEY, Value::Image(buf.into()));
 
             // Create AuxFilesDirectory as well
-            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-                files: HashMap::new(),
-            })?;
-            self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+            self.init_aux_dir()?;
         }
         if r.is_none() {
             // Create RelDirectory
@@ -1247,6 +1255,14 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
+        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+            files: HashMap::new(),
+        })?;
+        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+        Ok(())
+    }
+
     pub async fn put_file(
         &mut self,
         path: &str,
@@ -1309,7 +1325,7 @@ impl<'a> DatadirModification<'a> {
         // Flush relation and SLRU data blocks, keep metadata.
         let mut retained_pending_updates = HashMap::new();
         for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(key) || is_slru_block_key(key) {
+            if is_rel_block_key(&key) || is_slru_block_key(key) {
                 // This bails out on first error without modifying pending_updates.
                 // That's Ok, cf this function's doc comment.
                 writer.put(key, self.lsn, &value, ctx).await?;
@@ -1354,6 +1370,10 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    pub(crate) fn is_empty(&self) -> bool {
+        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
+    }
+
     // Internal helper functions to batch the modifications
 
     async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
@@ -1565,7 +1585,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
     }
 }
 
-fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
     Key {
         field1: 0x00,
         field2: rel.spcnode,
@@ -1749,6 +1769,13 @@ const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.
 
+// AUX_FILES currently stores only data for logical replication (slots etc), and
+// we don't preserve these on a branch because safekeepers can't follow timeline
+// switch (and generally it likely should be optional), so ignore these.
+pub fn is_inherited_key(key: Key) -> bool {
+    key != AUX_FILES_KEY
+}
+
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     Ok(match key.field1 {
         0x00 => (
@@ -1764,10 +1791,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     })
 }
 
-fn is_rel_block_key(key: Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0
-}
-
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
     key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
 }
@@ -138,6 +138,14 @@ pub struct GcResult {
 
     #[serde(serialize_with = "serialize_duration_as_millis")]
     pub elapsed: Duration,
+
+    /// The layers which were garbage collected.
+    ///
+    /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
+    /// dropped in tests.
+    #[cfg(feature = "testing")]
+    #[serde(skip)]
+    pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
 }
 
 // helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
@@ -158,5 +166,11 @@ impl AddAssign for GcResult {
         self.layers_removed += other.layers_removed;
 
         self.elapsed += other.elapsed;
+
+        #[cfg(feature = "testing")]
+        {
+            let mut other = other;
+            self.doomed_layers.append(&mut other.doomed_layers);
+        }
     }
 }
@@ -42,6 +42,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};
 
 use futures::FutureExt;
+use pageserver_api::shard::TenantShardId;
 use tokio::runtime::Runtime;
 use tokio::task::JoinHandle;
 use tokio::task_local;
@@ -51,7 +52,7 @@ use tracing::{debug, error, info, warn};
 
 use once_cell::sync::Lazy;
 
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;
 
 use crate::shutdown_pageserver;
 
@@ -317,7 +318,7 @@ struct PageServerTask {
 
     /// Tasks may optionally be launched for a particular tenant/timeline, enabling
     /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
     timeline_id: Option<TimelineId>,
 
     mutable: Mutex<MutableTaskState>,
@@ -326,26 +327,28 @@ struct PageServerTask {
 /// Launch a new task
 /// Note: if shutdown_process_on_error is set to true failure
 ///   of the task will lead to shutdown of entire process
+#[allow(clippy::too_many_arguments)]
 pub fn spawn<F>(
     runtime: &tokio::runtime::Handle,
     kind: TaskKind,
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
     timeline_id: Option<TimelineId>,
     name: &str,
     shutdown_process_on_error: bool,
+    cancel: CancellationToken,
     future: F,
 ) -> PageserverTaskId
 where
     F: Future<Output = anyhow::Result<()>> + Send + 'static,
 {
-    let cancel = CancellationToken::new();
+    // let cancel = CancellationToken::new();
     let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
     let task = Arc::new(PageServerTask {
         task_id: PageserverTaskId(task_id),
         kind,
         name: name.to_string(),
         cancel: cancel.clone(),
-        tenant_id,
+        tenant_shard_id,
        timeline_id,
         mutable: Mutex::new(MutableTaskState { join_handle: None }),
     });
@@ -424,28 +427,28 @@ async fn task_finish(
         Ok(Err(err)) => {
             if shutdown_process_on_error {
                 error!(
-                    "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                    task_name, task.tenant_id, task.timeline_id, err
+                    "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                    task_name, task.tenant_shard_id, task.timeline_id, err
                 );
                 shutdown_process = true;
             } else {
                 error!(
-                    "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                    task_name, task.tenant_id, task.timeline_id, err
+                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                    task_name, task.tenant_shard_id, task.timeline_id, err
                 );
             }
         }
         Err(err) => {
             if shutdown_process_on_error {
                 error!(
-                    "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                    task_name, task.tenant_id, task.timeline_id, err
+                    "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                    task_name, task.tenant_shard_id, task.timeline_id, err
                );
                 shutdown_process = true;
             } else {
                 error!(
-                    "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                    task_name, task.tenant_id, task.timeline_id, err
+                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                    task_name, task.tenant_shard_id, task.timeline_id, err
                 );
             }
         }
@@ -467,11 +470,11 @@ async fn task_finish(
 ///
 /// Or to shut down all tasks for given timeline:
 ///
-///   shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
+///   shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id))
 ///
 pub async fn shutdown_tasks(
     kind: Option<TaskKind>,
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
     timeline_id: Option<TimelineId>,
 ) {
     let mut victim_tasks = Vec::new();
@@ -480,35 +483,35 @@ pub async fn shutdown_tasks(
         let tasks = TASKS.lock().unwrap();
         for task in tasks.values() {
             if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_id.is_none() || task.tenant_id == tenant_id)
+                && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
                 && (timeline_id.is_none() || task.timeline_id == timeline_id)
             {
                 task.cancel.cancel();
                 victim_tasks.push((
                     Arc::clone(task),
                     task.kind,
-                    task.tenant_id,
+                    task.tenant_shard_id,
                     task.timeline_id,
                 ));
             }
         }
    }
 
-    let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
+    let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none();
 
-    for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
+    for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks {
         let join_handle = {
             let mut task_mut = task.mutable.lock().unwrap();
             task_mut.join_handle.take()
         };
         if let Some(mut join_handle) = join_handle {
             if log_all {
-                if tenant_id.is_none() {
+                if tenant_shard_id.is_none() {
                     // there are quite few of these
                     info!(name = task.name, kind = ?task_kind, "stopping global task");
                 } else {
                     // warn to catch these in tests; there shouldn't be any
-                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                    warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                 }
             }
             if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
@@ -517,12 +520,13 @@ pub async fn shutdown_tasks(
             {
                 // allow some time to elapse before logging to cut down the number of log
                 // lines.
-                info!("waiting for {} to shut down", task.name);
+                info!("waiting for task {} to shut down", task.name);
                 // we never handled this return value, but:
                 // - we don't deschedule which would lead to is_cancelled
                 // - panics are already logged (is_panicked)
                 // - task errors are already logged in the wrapper
                 let _ = join_handle.await;
+                info!("task {} completed", task.name);
             }
         } else {
             // Possibly one of:
@@ -556,9 +560,14 @@ pub async fn shutdown_watcher() {
/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
 /// `tokio::task::JoinSet::spawn`.
 pub fn shutdown_token() -> CancellationToken {
-    SHUTDOWN_TOKEN
-        .try_with(|t| t.clone())
-        .expect("shutdown_token() called in an unexpected task or thread")
+    let res = SHUTDOWN_TOKEN.try_with(|t| t.clone());
+
+    if cfg!(test) {
+        res.unwrap_or_default()
+    } else {
+        // tests need to call the same paths which need to use get the shutdown token
+        res.expect("shutdown_token() called in an unexpected task or thread")
+    }
 }
 
 /// Has the current task been requested to shut down?
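The `shutdown_token()` change above makes the task-local lookup fall back to a fresh, never-cancelled token under `cfg!(test)`. Below is a self-contained sketch of the same pattern, assuming `tokio` (with the `macros` and `rt` features) and `tokio-util`; apart from the function body quoted from the diff, the surrounding names and the `main` harness are illustrative.

```rust
use tokio_util::sync::CancellationToken;

tokio::task_local! {
    // Task-local shutdown token, set up when a managed task is spawned.
    static SHUTDOWN_TOKEN: CancellationToken;
}

/// Returns the current task's shutdown token. Outside a managed task this
/// panics in production builds, but in `cfg!(test)` it falls back to a fresh
/// (never-cancelled) token so test helpers can reuse production code paths.
pub fn shutdown_token() -> CancellationToken {
    let res = SHUTDOWN_TOKEN.try_with(|t| t.clone());
    if cfg!(test) {
        res.unwrap_or_default()
    } else {
        res.expect("shutdown_token() called in an unexpected task or thread")
    }
}

#[tokio::main]
async fn main() {
    let token = CancellationToken::new();
    // Run some work with the token scoped to the current task.
    SHUTDOWN_TOKEN
        .scope(token.clone(), async {
            let t = shutdown_token();
            assert!(!t.is_cancelled());
        })
        .await;
    token.cancel();
}
```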
(File diff suppressed because it is too large.)
@@ -8,9 +8,12 @@
|
|||||||
//! We cannot use global or default config instead, because wrong settings
|
//! We cannot use global or default config instead, because wrong settings
|
||||||
//! may lead to a data loss.
|
//! may lead to a data loss.
|
||||||
//!
|
//!
|
||||||
use anyhow::Context;
|
use anyhow::bail;
|
||||||
use pageserver_api::models;
|
use pageserver_api::models;
|
||||||
|
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||||
|
use serde::de::IntoDeserializer;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_json::Value;
|
||||||
use std::num::NonZeroU64;
|
use std::num::NonZeroU64;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use utils::generation::Generation;
|
use utils::generation::Generation;
|
||||||
@@ -88,6 +91,14 @@ pub(crate) struct LocationConf {
|
|||||||
/// The location-specific part of the configuration, describes the operating
|
/// The location-specific part of the configuration, describes the operating
|
||||||
/// mode of this pageserver for this tenant.
|
/// mode of this pageserver for this tenant.
|
||||||
pub(crate) mode: LocationMode,
|
pub(crate) mode: LocationMode,
|
||||||
|
|
||||||
|
/// The detailed shard identity. This structure is already scoped within
|
||||||
|
/// a TenantShardId, but we need the full ShardIdentity to enable calculating
|
||||||
|
/// key->shard mappings.
|
||||||
|
#[serde(default = "ShardIdentity::unsharded")]
|
||||||
|
#[serde(skip_serializing_if = "ShardIdentity::is_unsharded")]
|
||||||
|
pub(crate) shard: ShardIdentity,
|
||||||
|
|
||||||
/// The pan-cluster tenant configuration, the same on all locations
|
/// The pan-cluster tenant configuration, the same on all locations
|
||||||
pub(crate) tenant_conf: TenantConfOpt,
|
pub(crate) tenant_conf: TenantConfOpt,
|
||||||
}
|
}
|
||||||
@@ -160,6 +171,8 @@ impl LocationConf {
|
|||||||
generation,
|
generation,
|
||||||
attach_mode: AttachmentMode::Single,
|
attach_mode: AttachmentMode::Single,
|
||||||
}),
|
}),
|
||||||
|
// Legacy configuration loads are always from tenants created before sharding existed.
|
||||||
|
shard: ShardIdentity::unsharded(),
|
||||||
tenant_conf,
|
tenant_conf,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -187,6 +200,7 @@ impl LocationConf {
|
|||||||
|
|
||||||
fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
|
fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
|
||||||
conf.generation
|
conf.generation
|
||||||
|
.map(Generation::new)
|
||||||
.ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
|
.ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -226,7 +240,21 @@ impl LocationConf {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Self { mode, tenant_conf })
|
let shard = if conf.shard_count == 0 {
|
||||||
|
ShardIdentity::unsharded()
|
||||||
|
} else {
|
||||||
|
ShardIdentity::new(
|
||||||
|
ShardNumber(conf.shard_number),
|
||||||
|
ShardCount(conf.shard_count),
|
||||||
|
ShardStripeSize(conf.shard_stripe_size),
|
||||||
|
)?
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
shard,
|
||||||
|
mode,
|
||||||
|
tenant_conf,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -241,6 +269,7 @@ impl Default for LocationConf {
|
|||||||
attach_mode: AttachmentMode::Single,
|
attach_mode: AttachmentMode::Single,
|
||||||
}),
|
}),
|
||||||
tenant_conf: TenantConfOpt::default(),
|
tenant_conf: TenantConfOpt::default(),
|
||||||
|
shard: ShardIdentity::unsharded(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -494,105 +523,49 @@ impl Default for TenantConf {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper function to standardize the error messages we produce on bad durations
|
|
||||||
//
|
|
||||||
// Intended to be used with anyhow's `with_context`, e.g.:
|
|
||||||
//
|
|
||||||
// let value = result.with_context(bad_duration("name", &value))?;
|
|
||||||
//
|
|
||||||
fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
|
|
||||||
move || format!("Cannot parse `{field_name}` duration {value:?}")
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
||||||
type Error = anyhow::Error;
|
type Error = anyhow::Error;
|
||||||
|
|
||||||
fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
|
     fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
-        let mut tenant_conf = TenantConfOpt::default();
-
-        if let Some(gc_period) = &request_data.gc_period {
-            tenant_conf.gc_period = Some(
-                humantime::parse_duration(gc_period)
-                    .with_context(bad_duration("gc_period", gc_period))?,
-            );
-        }
-        tenant_conf.gc_horizon = request_data.gc_horizon;
-        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
-
-        if let Some(pitr_interval) = &request_data.pitr_interval {
-            tenant_conf.pitr_interval = Some(
-                humantime::parse_duration(pitr_interval)
-                    .with_context(bad_duration("pitr_interval", pitr_interval))?,
-            );
-        }
-
-        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
-            tenant_conf.walreceiver_connect_timeout = Some(
-                humantime::parse_duration(walreceiver_connect_timeout).with_context(
-                    bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
-                )?,
-            );
-        }
-        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
-            tenant_conf.lagging_wal_timeout = Some(
-                humantime::parse_duration(lagging_wal_timeout)
-                    .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
-            );
-        }
-        if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
-            tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
-        }
-        if let Some(trace_read_requests) = request_data.trace_read_requests {
-            tenant_conf.trace_read_requests = Some(trace_read_requests);
-        }
-
-        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
-            tenant_conf.checkpoint_timeout = Some(
-                humantime::parse_duration(checkpoint_timeout)
-                    .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
-            );
-        }
-
-        tenant_conf.compaction_target_size = request_data.compaction_target_size;
-        tenant_conf.compaction_threshold = request_data.compaction_threshold;
-
-        if let Some(compaction_period) = &request_data.compaction_period {
-            tenant_conf.compaction_period = Some(
-                humantime::parse_duration(compaction_period)
-                    .with_context(bad_duration("compaction_period", compaction_period))?,
-            );
-        }
-
-        if let Some(eviction_policy) = &request_data.eviction_policy {
-            tenant_conf.eviction_policy = Some(
-                serde::Deserialize::deserialize(eviction_policy)
-                    .context("parse field `eviction_policy`")?,
-            );
-        }
-
-        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
-
-        if let Some(evictions_low_residence_duration_metric_threshold) =
-            &request_data.evictions_low_residence_duration_metric_threshold
-        {
-            tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
-                humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
-                    .with_context(bad_duration(
-                        "evictions_low_residence_duration_metric_threshold",
-                        evictions_low_residence_duration_metric_threshold,
-                    ))?,
-            );
-        }
-        tenant_conf.gc_feedback = request_data.gc_feedback;
+        // Convert the request_data to a JSON Value
+        let json_value: Value = serde_json::to_value(request_data)?;
+
+        // Create a Deserializer from the JSON Value
+        let deserializer = json_value.into_deserializer();
+
+        // Use serde_path_to_error to deserialize the JSON Value into TenantConfOpt
+        let tenant_conf: TenantConfOpt = serde_path_to_error::deserialize(deserializer)?;

         Ok(tenant_conf)
     }
 }
+
+impl TryFrom<toml_edit::Item> for TenantConfOpt {
+    type Error = anyhow::Error;
+
+    fn try_from(item: toml_edit::Item) -> Result<Self, Self::Error> {
+        match item {
+            toml_edit::Item::Value(value) => {
+                let d = value.into_deserializer();
+                return serde_path_to_error::deserialize(d)
+                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
+            }
+            toml_edit::Item::Table(table) => {
+                let deserializer = toml_edit::de::Deserializer::new(table.into());
+                return serde_path_to_error::deserialize(deserializer)
+                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
+            }
+            _ => {
+                bail!("expected non-inline table but found {item}")
+            }
+        }
+    }
+}
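
Note: the value of routing deserialization through serde_path_to_error, as the two TryFrom impls above do, is that the error carries the path of the offending field. A minimal self-contained sketch of the same pattern; ExampleConf and parse are invented for illustration and are not part of this diff:

use serde::de::IntoDeserializer;
use serde::Deserialize;

#[derive(Debug, Default, Deserialize)]
#[serde(default)]
struct ExampleConf {
    gc_horizon: Option<u64>,
}

fn parse(json: serde_json::Value) -> anyhow::Result<ExampleConf> {
    let deserializer = json.into_deserializer();
    // On failure the error is prefixed with the path to the bad field,
    // e.g. `gc_horizon: invalid type: string "x", expected u64`.
    serde_path_to_error::deserialize(deserializer)
        .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner()))
}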

 #[cfg(test)]
 mod tests {
     use super::*;
+    use models::TenantConfig;

     #[test]
     fn de_serializing_pageserver_config_omits_empty_values() {
@@ -609,4 +582,38 @@ mod tests {
         assert_eq!(json_form, "{\"gc_horizon\":42}");
         assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap());
     }
+
+    #[test]
+    fn test_try_from_models_tenant_config_err() {
+        let tenant_config = models::TenantConfig {
+            lagging_wal_timeout: Some("5a".to_string()),
+            ..TenantConfig::default()
+        };
+
+        let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config);
+
+        assert!(
+            tenant_conf_opt.is_err(),
+            "Suceeded to convert TenantConfig to TenantConfOpt"
+        );
+
+        let expected_error_str =
+            "lagging_wal_timeout: invalid value: string \"5a\", expected a duration";
+        assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str);
+    }
+
+    #[test]
+    fn test_try_from_models_tenant_config_success() {
+        let tenant_config = models::TenantConfig {
+            lagging_wal_timeout: Some("5s".to_string()),
+            ..TenantConfig::default()
+        };
+
+        let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config).unwrap();
+
+        assert_eq!(
+            tenant_conf_opt.lagging_wal_timeout,
+            Some(Duration::from_secs(5))
+        );
+    }
 }

@@ -2,22 +2,19 @@ use std::sync::Arc;

 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::models::TenantState;
+use pageserver_api::{models::TenantState, shard::TenantShardId};
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, instrument, warn, Instrument, Span};
+use tracing::{error, instrument, Instrument, Span};

-use utils::{
-    backoff, completion, crashsafe, fs_ext,
-    id::{TenantId, TimelineId},
-};
+use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};

 use crate::{
     config::PageServerConf,
     context::RequestContext,
     task_mgr::{self, TaskKind},
-    InitializationOrder,
+    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
 };

 use super::{
@@ -59,10 +56,10 @@ type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;

 fn remote_tenant_delete_mark_path(
     conf: &PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> anyhow::Result<RemotePath> {
     let tenant_remote_path = conf
-        .tenant_path(tenant_id)
+        .tenant_path(tenant_shard_id)
         .strip_prefix(&conf.workdir)
         .context("Failed to strip workdir prefix")
         .and_then(RemotePath::new)
@@ -73,15 +70,17 @@ fn remote_tenant_delete_mark_path(
 async fn create_remote_delete_mark(
     conf: &PageServerConf,
     remote_storage: &GenericRemoteStorage,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
-    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;

     let data: &[u8] = &[];
     backoff::retry(
         || async {
+            let data = bytes::Bytes::from_static(data);
+            let stream = futures::stream::once(futures::future::ready(Ok(data)));
             remote_storage
-                .upload(data, 0, &remote_mark_path, None)
+                .upload(stream, 0, &remote_mark_path, None)
                 .await
         },
         |_e| false,
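
Note: the upload call above now takes a stream body rather than a byte slice. As a rough standalone sketch of that adapter (the function name is illustrative and not part of the remote_storage API):

use bytes::Bytes;
use futures::Stream;

// Wrap a static byte slice into a one-item stream of Ok(Bytes), the shape a
// streaming upload body expects.
fn one_shot_body(data: &'static [u8]) -> impl Stream<Item = std::io::Result<Bytes>> {
    futures::stream::once(futures::future::ready(Ok(Bytes::from_static(data))))
}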
@@ -99,9 +98,9 @@ async fn create_remote_delete_mark(

 async fn create_local_delete_mark(
     conf: &PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
-    let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
+    let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);

     // Note: we're ok to replace existing file.
     let _ = std::fs::OpenOptions::new()
@@ -170,10 +169,10 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
 async fn remove_tenant_remote_delete_mark(
     conf: &PageServerConf,
     remote_storage: Option<&GenericRemoteStorage>,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
     if let Some(remote_storage) = remote_storage {
-        let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+        let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
         backoff::retry(
             || async { remote_storage.delete(&path).await },
             |_e| false,
@@ -192,7 +191,7 @@ async fn remove_tenant_remote_delete_mark(
 // Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
 async fn cleanup_remaining_fs_traces(
     conf: &PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
     let rm = |p: Utf8PathBuf, is_dir: bool| async move {
         if is_dir {
@@ -204,8 +203,8 @@ async fn cleanup_remaining_fs_traces(
             .with_context(|| format!("failed to delete {p}"))
     };

-    rm(conf.tenant_config_path(tenant_id), false).await?;
-    rm(conf.tenant_location_config_path(tenant_id), false).await?;
+    rm(conf.tenant_config_path(tenant_shard_id), false).await?;
+    rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;

     fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
         Err(anyhow::anyhow!(
@@ -213,7 +212,7 @@ async fn cleanup_remaining_fs_traces(
         ))?
     });

-    rm(conf.timelines_path(tenant_id), true).await?;
+    rm(conf.timelines_path(tenant_shard_id), true).await?;

     fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
         Err(anyhow::anyhow!(
@@ -227,14 +226,14 @@ async fn cleanup_remaining_fs_traces(
     // to be reordered later and thus missed if a crash occurs.
     // Note that we dont need to sync after mark file is removed
    // because we can tolerate the case when mark file reappears on startup.
-    let tenant_path = &conf.tenant_path(tenant_id);
+    let tenant_path = &conf.tenant_path(tenant_shard_id);
     if tenant_path.exists() {
-        crashsafe::fsync_async(&conf.tenant_path(tenant_id))
+        crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
             .await
             .context("fsync_pre_mark_remove")?;
     }

-    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
+    rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;

     fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
         Err(anyhow::anyhow!(
@@ -242,7 +241,7 @@ async fn cleanup_remaining_fs_traces(
         ))?
     });

-    rm(conf.tenant_path(tenant_id), true).await?;
+    rm(conf.tenant_path(tenant_shard_id), true).await?;

     Ok(())
 }
@@ -287,6 +286,8 @@ impl DeleteTenantFlow {
     ) -> Result<(), DeleteTenantError> {
         span::debug_assert_current_span_has_tenant_id();

+        pausable_failpoint!("tenant-delete-before-run");
+
         let mut guard = Self::prepare(&tenant).await?;

         if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
@@ -321,7 +322,7 @@ impl DeleteTenantFlow {
         // Though sounds scary, different mark name?
         // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
         if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id)
                 .await
                 .context("remote_mark")?
         }
@@ -332,7 +333,7 @@ impl DeleteTenantFlow {
             ))?
         });

-        create_local_delete_mark(conf, &tenant.tenant_id)
+        create_local_delete_mark(conf, &tenant.tenant_shard_id)
             .await
             .context("local delete mark")?;

@@ -374,9 +375,11 @@ impl DeleteTenantFlow {
             return Ok(acquire(tenant));
         }

-        let tenant_id = tenant.tenant_id;
         // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
-        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
+        if conf
+            .tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
+            .exists()
+        {
             Ok(acquire(tenant))
         } else {
             Ok(None)
@@ -388,7 +391,6 @@ impl DeleteTenantFlow {
         tenant: &Arc<Tenant>,
         preload: Option<TenantPreload>,
         tenants: &'static std::sync::RwLock<TenantsMap>,
-        init_order: Option<InitializationOrder>,
         ctx: &RequestContext,
     ) -> Result<(), DeleteTenantError> {
         let (_, progress) = completion::channel();
@@ -398,10 +400,7 @@ impl DeleteTenantFlow {
             .await
             .expect("cant be stopping or broken");

-        tenant
-            .attach(init_order, preload, ctx)
-            .await
-            .context("attach")?;
+        tenant.attach(preload, ctx).await.context("attach")?;

         Self::background(
             guard,
@@ -459,15 +458,22 @@ impl DeleteTenantFlow {
         tenants: &'static std::sync::RwLock<TenantsMap>,
         tenant: Arc<Tenant>,
     ) {
-        let tenant_id = tenant.tenant_id;
+        let tenant_shard_id = tenant.tenant_shard_id;
+
+        let cancel = crate::PAGESERVER_SHUTDOWN_TOKEN
+            .get()
+            .cloned()
+            .unwrap_or_default()
+            .child_token();

         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             TaskKind::TimelineDeletionWorker,
-            Some(tenant_id),
+            Some(tenant_shard_id),
             None,
             "tenant_delete",
             false,
+            cancel,
             async move {
                 if let Err(err) =
                     Self::background(guard, conf, remote_storage, tenants, &tenant).await
@@ -478,7 +484,7 @@ impl DeleteTenantFlow {
                 Ok(())
             }
             .instrument({
-                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
+                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
                 span.follows_from(Span::current());
                 span
             }),
@@ -516,7 +522,7 @@ impl DeleteTenantFlow {
             }
         }

-        let timelines_path = conf.timelines_path(&tenant.tenant_id);
+        let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
         // May not exist if we fail in cleanup_remaining_fs_traces after removing it
         if timelines_path.exists() {
             // sanity check to guard against layout changes
@@ -525,7 +531,8 @@ impl DeleteTenantFlow {
             .context("timelines dir not empty")?;
         }

-        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
+        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id)
+            .await?;

         fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
             Err(anyhow::anyhow!(
@@ -533,21 +540,73 @@ impl DeleteTenantFlow {
             ))?
         });

-        cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
+        cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
             .await
             .context("cleanup_remaining_fs_traces")?;

         {
-            let mut locked = tenants.write().unwrap();
-            if locked.remove(&tenant.tenant_id).is_none() {
-                warn!("Tenant got removed from tenants map during deletion");
-            };
+            pausable_failpoint!("tenant-delete-before-map-remove");

-            // FIXME: we should not be modifying this from outside of mgr.rs.
-            // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
-            crate::metrics::TENANT_MANAGER
-                .tenant_slots
-                .set(locked.len() as u64);
+            // This block is simply removing the TenantSlot for this tenant. It requires a loop because
+            // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
+            //
+            // This complexity will go away when we simplify how deletion works:
+            // https://github.com/neondatabase/neon/issues/5080
+            loop {
+                // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
+                // we encounter an InProgress marker, yield the barrier it contains and wait on it.
+                let barrier = {
+                    let mut locked = tenants.write().unwrap();
+                    let removed = locked.remove(tenant.tenant_shard_id);
+
+                    // FIXME: we should not be modifying this from outside of mgr.rs.
+                    // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
+                    crate::metrics::TENANT_MANAGER
+                        .tenant_slots
+                        .set(locked.len() as u64);
+
+                    match removed {
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
+                            match tenant.current_state() {
+                                TenantState::Stopping { .. } | TenantState::Broken { .. } => {
+                                    // Expected: we put the tenant into stopping state before we start deleting it
+                                }
+                                state => {
+                                    // Unexpected state
+                                    tracing::warn!(
+                                        "Tenant in unexpected state {state} after deletion"
+                                    );
+                                }
+                            }
+                            break;
+                        }
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
+                            // This is unexpected: this secondary tenants should not have been created, and we
+                            // are not in a position to shut it down from here.
+                            tracing::warn!("Tenant transitioned to secondary mode while deleting!");
+                            break;
+                        }
+                        TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
+                            unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
+                        }
+                        TenantsMapRemoveResult::Vacant => {
+                            tracing::warn!(
+                                "Tenant removed from TenantsMap before deletion completed"
+                            );
+                            break;
+                        }
+                        TenantsMapRemoveResult::InProgress(barrier) => {
+                            // An InProgress entry was found, we must wait on its barrier
+                            barrier
+                        }
+                    }
+                };
+
+                tracing::info!(
+                    "Waiting for competing operation to complete before deleting state for tenant"
+                );
+                barrier.wait().await;
+            }
         }

         *guard = Self::Finished;
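
Note: the remove-or-wait loop introduced above follows a common pattern: decide under the lock, and if a competing operation owns the slot, wait outside the lock and retry. A schematic version with toy types; Notify stands in for the completion barrier and none of these names come from the codebase:

use std::collections::BTreeMap;
use std::sync::Arc;
use tokio::sync::{Mutex, Notify};

enum Slot {
    Ready(String),
    InProgress(Arc<Notify>),
}

async fn remove_when_free(map: &Mutex<BTreeMap<u32, Slot>>, key: u32) -> Option<String> {
    loop {
        let waiter = {
            let mut locked = map.lock().await;
            match locked.remove(&key) {
                None => return None,
                Some(Slot::Ready(v)) => return Some(v),
                Some(Slot::InProgress(n)) => {
                    // Competing operation in progress: put the marker back and wait.
                    let waiter = n.clone();
                    locked.insert(key, Slot::InProgress(n));
                    waiter
                }
            }
        };
        // Wait outside the lock, then retry from the top.
        waiter.notified().await;
    }
}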

@@ -7,18 +7,19 @@ use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
 use camino::Utf8PathBuf;
+use pageserver_api::shard::TenantShardId;
 use std::cmp::min;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
 use std::sync::atomic::AtomicU64;
 use tracing::*;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;

 pub struct EphemeralFile {
     page_cache_file_id: page_cache::FileId,

-    _tenant_id: TenantId,
+    _tenant_shard_id: TenantShardId,
     _timeline_id: TimelineId,
     file: VirtualFile,
     len: u64,
@@ -31,7 +32,7 @@ pub struct EphemeralFile {
 impl EphemeralFile {
     pub async fn create(
         conf: &PageServerConf,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
     ) -> Result<EphemeralFile, io::Error> {
         static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -39,7 +40,7 @@ impl EphemeralFile {
             NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

         let filename = conf
-            .timeline_path(&tenant_id, &timeline_id)
+            .timeline_path(&tenant_shard_id, &timeline_id)
             .join(Utf8PathBuf::from(format!(
                 "ephemeral-{filename_disambiguator}"
             )));
@@ -52,7 +53,7 @@ impl EphemeralFile {

         Ok(EphemeralFile {
             page_cache_file_id: page_cache::next_file_id(),
-            _tenant_id: tenant_id,
+            _tenant_shard_id: tenant_shard_id,
             _timeline_id: timeline_id,
             file,
             len: 0,
@@ -282,7 +283,7 @@ mod tests {
     ) -> Result<
         (
             &'static PageServerConf,
-            TenantId,
+            TenantShardId,
             TimelineId,
             RequestContext,
         ),
@@ -295,13 +296,13 @@ mod tests {
         // OK in a test.
         let conf: &'static PageServerConf = Box::leak(Box::new(conf));

-        let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
+        let tenant_shard_id = TenantShardId::from_str("11000000000000000000000000000000").unwrap();
         let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
-        fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
+        fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;

         let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

-        Ok((conf, tenant_id, timeline_id, ctx))
+        Ok((conf, tenant_shard_id, timeline_id, ctx))
     }

     #[tokio::test]
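
Aside: the filename disambiguator used by EphemeralFile::create above is just a process-wide atomic counter. A minimal standalone sketch of the same idea (the function name is invented for illustration):

use std::sync::atomic::{AtomicU64, Ordering};

fn next_ephemeral_name(prefix: &str) -> String {
    static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
    // Relaxed ordering is enough: we only need uniqueness, not synchronization.
    let n = NEXT_FILENAME.fetch_add(1, Ordering::Relaxed);
    format!("{prefix}-{n}")
}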

@@ -11,15 +11,12 @@
 use std::io::{self};

 use anyhow::{ensure, Context};
+use pageserver_api::shard::TenantShardId;
 use serde::{de::Error, Deserialize, Serialize, Serializer};
 use thiserror::Error;
 use utils::bin_ser::SerializeError;
 use utils::crashsafe::path_with_suffix_extension;
-use utils::{
-    bin_ser::BeSer,
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};

 use crate::config::PageServerConf;
 use crate::virtual_file::VirtualFile;
@@ -272,14 +269,14 @@ impl Serialize for TimelineMetadata {
 }

 /// Save timeline metadata to file
-#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
+#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
 pub async fn save_metadata(
     conf: &'static PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
     data: &TimelineMetadata,
 ) -> anyhow::Result<()> {
-    let path = conf.metadata_path(tenant_id, timeline_id);
+    let path = conf.metadata_path(tenant_shard_id, timeline_id);
     let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
     let metadata_bytes = data.to_bytes().context("serialize metadata")?;
     VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
@@ -299,10 +296,10 @@ pub enum LoadMetadataError {

 pub fn load_metadata(
     conf: &'static PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
 ) -> Result<TimelineMetadata, LoadMetadataError> {
-    let metadata_path = conf.metadata_path(tenant_id, timeline_id);
+    let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
     let metadata_bytes = std::fs::read(metadata_path)?;

     Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
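
Note: the instrument attribute above records the tenant and shard as span fields. For reference, the general shape of recording a Display value in a tracing span field looks like this; the values are made up for illustration:

use tracing::{info, info_span};

fn example() {
    let tenant = "1f359dd625e519a1a4e8d7509690f6fc";
    let shard = "0001";
    // `%value` records the field via its Display impl; `?value` would use Debug.
    let span = info_span!("save_metadata", tenant_id = %tenant, shard_id = %shard);
    let _guard = span.enter();
    info!("metadata written");
}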

@@ -2,7 +2,8 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::key::Key;
+use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
@@ -29,7 +30,9 @@ use crate::control_plane_client::{
 use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::TENANT_MANAGER as METRICS;
 use crate::task_mgr::{self, TaskKind};
-use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
+use crate::tenant::config::{
+    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
+};
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
@@ -95,57 +98,100 @@ pub(crate) enum TenantsMap {
     ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
 }

-/// Helper for mapping shard-unaware functions to a sharding-aware map
-/// TODO(sharding): all users of this must be made shard-aware.
-fn exactly_one_or_none<'a>(
-    map: &'a BTreeMap<TenantShardId, TenantSlot>,
-    tenant_id: &TenantId,
-) -> Option<(&'a TenantShardId, &'a TenantSlot)> {
-    let mut slots = map.range(TenantShardId::tenant_range(*tenant_id));
-
-    // Retrieve the first two slots in the range: if both are populated, we must panic because the caller
-    // needs a shard-naive view of the world in which only one slot can exist for a TenantId at a time.
-    let slot_a = slots.next();
-    let slot_b = slots.next();
-    match (slot_a, slot_b) {
-        (None, None) => None,
-        (Some(slot), None) => {
-            // Exactly one matching slot
-            Some(slot)
-        }
-        (Some(_slot_a), Some(_slot_b)) => {
-            // Multiple shards for this tenant: cannot handle this yet.
-            // TODO(sharding): callers of get() should be shard-aware.
-            todo!("Attaching multiple shards in teh same tenant to the same pageserver")
-        }
-        (None, Some(_)) => unreachable!(),
-    }
+pub(crate) enum TenantsMapRemoveResult {
+    Occupied(TenantSlot),
+    Vacant,
+    InProgress(utils::completion::Barrier),
+}
+
+/// When resolving a TenantId to a shard, we may be looking for the 0th
+/// shard, or we might be looking for whichever shard holds a particular page.
+pub(crate) enum ShardSelector {
+    /// Only return the 0th shard, if it is present. If a non-0th shard is present,
+    /// ignore it.
+    Zero,
+    /// Pick the first shard we find for the TenantId
+    First,
+    /// Pick the shard that holds this key
+    Page(Key),
 }

 impl TenantsMap {
     /// Convenience function for typical usage, where we want to get a `Tenant` object, for
     /// working with attached tenants. If the TenantId is in the map but in Secondary state,
     /// None is returned.
-    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
         match self {
             TenantsMap::Initializing => None,
             TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                // TODO(sharding): callers of get() should be shard-aware.
-                exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached())
+                m.get(tenant_shard_id).and_then(|slot| slot.get_attached())
             }
         }
     }

-    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<TenantSlot> {
+    /// A page service client sends a TenantId, and to look up the correct Tenant we must
+    /// resolve this to a fully qualified TenantShardId.
+    fn resolve_shard(
+        &self,
+        tenant_id: &TenantId,
+        selector: ShardSelector,
+    ) -> Option<TenantShardId> {
+        let mut want_shard = None;
         match self {
             TenantsMap::Initializing => None,
             TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
-                key.and_then(|key| m.remove(&key))
+                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
+                    match selector {
+                        ShardSelector::First => return Some(*slot.0),
+                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
+                            return Some(*slot.0)
+                        }
+                        ShardSelector::Page(key) => {
+                            if let Some(tenant) = slot.1.get_attached() {
+                                // First slot we see for this tenant, calculate the expected shard number
+                                // for the key: we will use this for checking if this and subsequent
+                                // slots contain the key, rather than recalculating the hash each time.
+                                if want_shard.is_none() {
+                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                                }
+
+                                if Some(tenant.shard_identity.number) == want_shard {
+                                    return Some(*slot.0);
+                                }
+                            } else {
+                                continue;
+                            }
+                        }
+                        _ => continue,
+                    }
+                }
+
+                // Fall through: we didn't find an acceptable shard
+                None
             }
         }
     }

+    /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map.
+    ///
+    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
+    /// slot if the enclosed tenant is shutdown.
+    pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult {
+        use std::collections::btree_map::Entry;
+        match self {
+            TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) {
+                Entry::Occupied(entry) => match entry.get() {
+                    TenantSlot::InProgress(barrier) => {
+                        TenantsMapRemoveResult::InProgress(barrier.clone())
+                    }
+                    _ => TenantsMapRemoveResult::Occupied(entry.remove()),
+                },
+                Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
+            },
+        }
+    }
+
     pub(crate) fn len(&self) -> usize {
         match self {
             TenantsMap::Initializing => 0,
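
To make the ShardSelector::Page case concrete: hash the key once against the first attached shard's layout, then return the shard whose number matches. A toy version of that lookup follows; all types and the hashing rule are simplified stand-ins, not the pageserver_api ones:

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct ToyShardNumber(u8);

struct ToyShard {
    number: ToyShardNumber,
    shard_count: u8,
}

fn toy_key_to_shard(key_hash: u32, shard_count: u8) -> ToyShardNumber {
    // Stand-in for the real key->shard mapping.
    ToyShardNumber((key_hash % shard_count.max(1) as u32) as u8)
}

fn resolve_page_shard(shards: &[ToyShard], key_hash: u32) -> Option<ToyShardNumber> {
    let mut want_shard = None;
    for shard in shards {
        // Compute the expected shard once, then compare each candidate against it.
        let want = *want_shard.get_or_insert_with(|| toy_key_to_shard(key_hash, shard.shard_count));
        if shard.number == want {
            return Some(shard.number);
        }
    }
    None
}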
@@ -190,49 +236,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
|
|||||||
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
|
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
|
||||||
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
|
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
|
||||||
|
|
||||||
/// Create a directory, including parents. This does no fsyncs and makes
|
|
||||||
/// no guarantees about the persistence of the resulting metadata: for
|
|
||||||
/// use when creating dirs for use as cache.
|
|
||||||
async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
|
|
||||||
let mut dirs_to_create = Vec::new();
|
|
||||||
let mut path: &Utf8Path = path.as_ref();
|
|
||||||
|
|
||||||
// Figure out which directories we need to create.
|
|
||||||
loop {
|
|
||||||
let meta = tokio::fs::metadata(path).await;
|
|
||||||
match meta {
|
|
||||||
Ok(metadata) if metadata.is_dir() => break,
|
|
||||||
Ok(_) => {
|
|
||||||
return Err(std::io::Error::new(
|
|
||||||
std::io::ErrorKind::AlreadyExists,
|
|
||||||
format!("non-directory found in path: {path}"),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
|
||||||
Err(e) => return Err(e),
|
|
||||||
}
|
|
||||||
|
|
||||||
dirs_to_create.push(path);
|
|
||||||
|
|
||||||
match path.parent() {
|
|
||||||
Some(parent) => path = parent,
|
|
||||||
None => {
|
|
||||||
return Err(std::io::Error::new(
|
|
||||||
std::io::ErrorKind::InvalidInput,
|
|
||||||
format!("can't find parent of path '{path}'"),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create directories from parent to child.
|
|
||||||
for &path in dirs_to_create.iter().rev() {
|
|
||||||
tokio::fs::create_dir(path).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The TenantManager is responsible for storing and mutating the collection of all tenants
|
/// The TenantManager is responsible for storing and mutating the collection of all tenants
|
||||||
/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance
|
/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance
|
||||||
/// lives inside the TenantManager.
|
/// lives inside the TenantManager.
|
||||||
@@ -250,8 +253,8 @@ pub struct TenantManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn emergency_generations(
|
fn emergency_generations(
|
||||||
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
|
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
|
||||||
) -> HashMap<TenantId, Generation> {
|
) -> HashMap<TenantShardId, Generation> {
|
||||||
tenant_confs
|
tenant_confs
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|(tid, lc)| {
|
.filter_map(|(tid, lc)| {
|
||||||
@@ -271,16 +274,16 @@ fn emergency_generations(
|
|||||||
|
|
||||||
async fn init_load_generations(
|
async fn init_load_generations(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
|
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
|
||||||
resources: &TenantSharedResources,
|
resources: &TenantSharedResources,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> anyhow::Result<Option<HashMap<TenantId, Generation>>> {
|
) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
|
||||||
let generations = if conf.control_plane_emergency_mode {
|
let generations = if conf.control_plane_emergency_mode {
|
||||||
error!(
|
error!(
|
||||||
"Emergency mode! Tenants will be attached unsafely using their last known generation"
|
"Emergency mode! Tenants will be attached unsafely using their last known generation"
|
||||||
);
|
);
|
||||||
emergency_generations(tenant_confs)
|
emergency_generations(tenant_confs)
|
||||||
} else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
|
} else if let Some(client) = ControlPlaneClient::new(conf, cancel.child_token()) {
|
||||||
info!("Calling control plane API to re-attach tenants");
|
info!("Calling control plane API to re-attach tenants");
|
||||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||||
match client.re_attach().await {
|
match client.re_attach().await {
|
||||||
@@ -317,7 +320,7 @@ async fn init_load_generations(
|
|||||||
fn load_tenant_config(
|
fn load_tenant_config(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
dentry: Utf8DirEntry,
|
dentry: Utf8DirEntry,
|
||||||
) -> anyhow::Result<Option<(TenantId, anyhow::Result<LocationConf>)>> {
|
) -> anyhow::Result<Option<(TenantShardId, anyhow::Result<LocationConf>)>> {
|
||||||
let tenant_dir_path = dentry.path().to_path_buf();
|
let tenant_dir_path = dentry.path().to_path_buf();
|
||||||
if crate::is_temporary(&tenant_dir_path) {
|
if crate::is_temporary(&tenant_dir_path) {
|
||||||
info!("Found temporary tenant directory, removing: {tenant_dir_path}");
|
info!("Found temporary tenant directory, removing: {tenant_dir_path}");
|
||||||
@@ -353,10 +356,10 @@ fn load_tenant_config(
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
let tenant_id = match tenant_dir_path
|
let tenant_shard_id = match tenant_dir_path
|
||||||
.file_name()
|
.file_name()
|
||||||
.unwrap_or_default()
|
.unwrap_or_default()
|
||||||
.parse::<TenantId>()
|
.parse::<TenantShardId>()
|
||||||
{
|
{
|
||||||
Ok(id) => id,
|
Ok(id) => id,
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
@@ -366,8 +369,8 @@ fn load_tenant_config(
|
|||||||
};
|
};
|
||||||
|
|
||||||
Ok(Some((
|
Ok(Some((
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
Tenant::load_tenant_config(conf, &tenant_id),
|
Tenant::load_tenant_config(conf, &tenant_shard_id),
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -378,7 +381,7 @@ fn load_tenant_config(
|
|||||||
/// seconds even on reasonably fast drives.
|
/// seconds even on reasonably fast drives.
|
||||||
async fn init_load_tenant_configs(
|
async fn init_load_tenant_configs(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
) -> anyhow::Result<HashMap<TenantId, anyhow::Result<LocationConf>>> {
|
) -> anyhow::Result<HashMap<TenantShardId, anyhow::Result<LocationConf>>> {
|
||||||
let tenants_dir = conf.tenants_path();
|
let tenants_dir = conf.tenants_path();
|
||||||
|
|
||||||
let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
|
let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
|
||||||
@@ -428,19 +431,19 @@ pub async fn init_tenant_mgr(
|
|||||||
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
||||||
|
|
||||||
// Construct `Tenant` objects and start them running
|
// Construct `Tenant` objects and start them running
|
||||||
for (tenant_id, location_conf) in tenant_configs {
|
for (tenant_shard_id, location_conf) in tenant_configs {
|
||||||
let tenant_dir_path = conf.tenant_path(&tenant_id);
|
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
|
||||||
|
|
||||||
let mut location_conf = match location_conf {
|
let mut location_conf = match location_conf {
|
||||||
Ok(l) => l,
|
Ok(l) => l,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
warn!(%tenant_id, "Marking tenant broken, failed to {e:#}");
|
warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}");
|
||||||
|
|
||||||
tenants.insert(
|
tenants.insert(
|
||||||
TenantShardId::unsharded(tenant_id),
|
tenant_shard_id,
|
||||||
TenantSlot::Attached(Tenant::create_broken_tenant(
|
TenantSlot::Attached(Tenant::create_broken_tenant(
|
||||||
conf,
|
conf,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
format!("{}", e),
|
format!("{}", e),
|
||||||
)),
|
)),
|
||||||
);
|
);
|
||||||
@@ -451,7 +454,7 @@ pub async fn init_tenant_mgr(
|
|||||||
let generation = if let Some(generations) = &tenant_generations {
|
let generation = if let Some(generations) = &tenant_generations {
|
||||||
// We have a generation map: treat it as the authority for whether
|
// We have a generation map: treat it as the authority for whether
|
||||||
// this tenant is really attached.
|
// this tenant is really attached.
|
||||||
if let Some(gen) = generations.get(&tenant_id) {
|
if let Some(gen) = generations.get(&tenant_shard_id) {
|
||||||
*gen
|
*gen
|
||||||
} else {
|
} else {
|
||||||
match &location_conf.mode {
|
match &location_conf.mode {
|
||||||
@@ -459,8 +462,8 @@ pub async fn init_tenant_mgr(
|
|||||||
// We do not require the control plane's permission for secondary mode
|
// We do not require the control plane's permission for secondary mode
|
||||||
// tenants, because they do no remote writes and hence require no
|
// tenants, because they do no remote writes and hence require no
|
||||||
// generation number
|
// generation number
|
||||||
info!(%tenant_id, "Loaded tenant in secondary mode");
|
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
|
||||||
tenants.insert(TenantShardId::unsharded(tenant_id), TenantSlot::Secondary);
|
tenants.insert(tenant_shard_id, TenantSlot::Secondary);
|
||||||
}
|
}
|
||||||
LocationMode::Attached(_) => {
|
LocationMode::Attached(_) => {
|
||||||
// TODO: augment re-attach API to enable the control plane to
|
// TODO: augment re-attach API to enable the control plane to
|
||||||
@@ -468,9 +471,9 @@ pub async fn init_tenant_mgr(
|
|||||||
// away local state, we can gracefully fall back to secondary here, if the control
|
// away local state, we can gracefully fall back to secondary here, if the control
|
||||||
// plane tells us so.
|
// plane tells us so.
|
||||||
// (https://github.com/neondatabase/neon/issues/5377)
|
// (https://github.com/neondatabase/neon/issues/5377)
|
||||||
info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response");
|
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
|
||||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||||
error!(%tenant_id,
|
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||||
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -482,21 +485,23 @@ pub async fn init_tenant_mgr(
|
|||||||
} else {
|
} else {
|
||||||
// Legacy mode: no generation information, any tenant present
|
// Legacy mode: no generation information, any tenant present
|
||||||
// on local disk may activate
|
// on local disk may activate
|
||||||
info!(%tenant_id, "Starting tenant in legacy mode, no generation",);
|
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
|
||||||
Generation::none()
|
Generation::none()
|
||||||
};
|
};
|
||||||
|
|
||||||
// Presence of a generation number implies attachment: attach the tenant
|
// Presence of a generation number implies attachment: attach the tenant
|
||||||
// if it wasn't already, and apply the generation number.
|
// if it wasn't already, and apply the generation number.
|
||||||
location_conf.attach_in_generation(generation);
|
location_conf.attach_in_generation(generation);
|
||||||
Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
|
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||||
|
|
||||||
|
let shard_identity = location_conf.shard;
|
||||||
match tenant_spawn(
|
match tenant_spawn(
|
||||||
conf,
|
conf,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
&tenant_dir_path,
|
&tenant_dir_path,
|
||||||
resources.clone(),
|
resources.clone(),
|
||||||
AttachedTenantConf::try_from(location_conf)?,
|
AttachedTenantConf::try_from(location_conf)?,
|
||||||
|
shard_identity,
|
||||||
Some(init_order.clone()),
|
Some(init_order.clone()),
|
||||||
&TENANTS,
|
&TENANTS,
|
||||||
SpawnMode::Normal,
|
SpawnMode::Normal,
|
||||||
@@ -509,7 +514,7 @@ pub async fn init_tenant_mgr(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!(%tenant_id, "Failed to start tenant: {e:#}");
|
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -533,10 +538,11 @@ pub async fn init_tenant_mgr(
|
|||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub(crate) fn tenant_spawn(
|
pub(crate) fn tenant_spawn(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
tenant_path: &Utf8Path,
|
tenant_path: &Utf8Path,
|
||||||
resources: TenantSharedResources,
|
resources: TenantSharedResources,
|
||||||
location_conf: AttachedTenantConf,
|
location_conf: AttachedTenantConf,
|
||||||
|
shard_identity: ShardIdentity,
|
||||||
init_order: Option<InitializationOrder>,
|
init_order: Option<InitializationOrder>,
|
||||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||||
mode: SpawnMode,
|
mode: SpawnMode,
|
||||||
@@ -557,18 +563,25 @@ pub(crate) fn tenant_spawn(
|
|||||||
"Cannot load tenant from empty directory {tenant_path:?}"
|
"Cannot load tenant from empty directory {tenant_path:?}"
|
||||||
);
|
);
|
||||||
|
|
||||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
!conf.tenant_ignore_mark_file_path(&tenant_id).exists(),
|
!conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(),
|
||||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||||
);
|
);
|
||||||
|
|
||||||
info!("Attaching tenant {tenant_id}");
|
info!(
|
||||||
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
|
shard_id = %tenant_shard_id.shard_slug(),
|
||||||
|
generation = ?location_conf.location.generation,
|
||||||
|
attach_mode = ?location_conf.location.attach_mode,
|
||||||
|
"Attaching tenant"
|
||||||
|
);
|
||||||
let tenant = match Tenant::spawn(
|
let tenant = match Tenant::spawn(
|
||||||
conf,
|
conf,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
resources,
|
resources,
|
||||||
location_conf,
|
location_conf,
|
||||||
|
shard_identity,
|
||||||
init_order,
|
init_order,
|
||||||
tenants,
|
tenants,
|
||||||
mode,
|
mode,
|
||||||
@@ -576,8 +589,8 @@ pub(crate) fn tenant_spawn(
|
|||||||
) {
|
) {
|
||||||
Ok(tenant) => tenant,
|
Ok(tenant) => tenant,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
|
error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
|
||||||
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
|
Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -732,19 +745,20 @@ pub(crate) async fn create_tenant(
|
|||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<Arc<Tenant>, TenantMapInsertError> {
|
) -> Result<Arc<Tenant>, TenantMapInsertError> {
|
||||||
let location_conf = LocationConf::attached_single(tenant_conf, generation);
|
let location_conf = LocationConf::attached_single(tenant_conf, generation);
|
||||||
|
info!("Creating tenant at location {location_conf:?}");
|
||||||
|
|
||||||
let slot_guard =
|
let slot_guard =
|
||||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
|
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
|
||||||
// TODO(sharding): make local paths shard-aware
|
let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
|
||||||
let tenant_path =
|
|
||||||
super::create_tenant_files(conf, &location_conf, &tenant_shard_id.tenant_id).await?;
|
|
||||||
|
|
||||||
|
let shard_identity = location_conf.shard;
|
||||||
 let created_tenant = tenant_spawn(
 conf,
-tenant_shard_id.tenant_id,
+tenant_shard_id,
 &tenant_path,
 resources,
 AttachedTenantConf::try_from(location_conf)?,
+shard_identity,
 None,
 &TENANTS,
 SpawnMode::Create,
@@ -774,15 +788,18 @@ pub(crate) async fn set_new_tenant_config(
 new_tenant_conf: TenantConfOpt,
 tenant_id: TenantId,
 ) -> Result<(), SetNewTenantConfigError> {
+// Legacy API: does not support sharding
+let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
 info!("configuring tenant {tenant_id}");
-let tenant = get_tenant(tenant_id, true)?;
+let tenant = get_tenant(tenant_shard_id, true)?;

 // This is a legacy API that only operates on attached tenants: the preferred
 // API to use is the location_config/ endpoint, which lets the caller provide
 // the full LocationConf.
 let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);

-Tenant::persist_tenant_config(conf, &tenant_id, &location_conf)
+Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
 .await
 .map_err(SetNewTenantConfigError::Persist)?;
 tenant.set_new_tenant_config(new_tenant_conf);
@@ -792,8 +809,6 @@ pub(crate) async fn set_new_tenant_config(
 impl TenantManager {
 /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
-///
-/// This method is cancel-safe.
 pub(crate) fn get_attached_tenant_shard(
 &self,
 tenant_shard_id: TenantShardId,
@@ -838,10 +853,12 @@ impl TenantManager {
 Ok(())
 }

+#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
 pub(crate) async fn upsert_location(
 &self,
 tenant_shard_id: TenantShardId,
 new_location_config: LocationConf,
+flush: Option<Duration>,
 ctx: &RequestContext,
 ) -> Result<(), anyhow::Error> {
 debug_assert_current_span_has_tenant_id();
@@ -850,7 +867,7 @@ impl TenantManager {
 // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
 // then we do not need to set the slot to InProgress, we can just call into the
 // existng tenant.
-{
+let modify_tenant = {
 let locked = self.tenants.read().unwrap();
 let peek_slot =
 tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -861,22 +878,50 @@ impl TenantManager {
 // take our fast path and just provide the updated configuration
 // to the tenant.
 tenant.set_new_location_config(AttachedTenantConf::try_from(
-new_location_config,
+new_location_config.clone(),
 )?);

-// Persist the new config in the background, to avoid holding up any
-// locks while we do so.
-// TODO
-
-return Ok(());
+Some(tenant.clone())
 } else {
 // Different generations, fall through to general case
+None
 }
 }
 _ => {
 // Not an Attached->Attached transition, fall through to general case
+None
 }
 }
+};

+// Fast-path continued: having dropped out of the self.tenants lock, do the async
+// phase of waiting for flush, before returning.
+if let Some(tenant) = modify_tenant {
+// Transition to AttachedStale means we may well hold a valid generation
+// still, and have been requested to go stale as part of a migration. If
+// the caller set `flush`, then flush to remote storage.
+if let LocationMode::Attached(AttachedLocationConfig {
+generation: _,
+attach_mode: AttachmentMode::Stale,
+}) = &new_location_config.mode
+{
+if let Some(flush_timeout) = flush {
+match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
+Ok(Err(e)) => {
+return Err(e);
+}
+Ok(Ok(_)) => return Ok(()),
+Err(_) => {
+tracing::warn!(
+timeout_ms = flush_timeout.as_millis(),
+"Timed out waiting for flush to remote storage, proceeding anyway."
+)
+}
+}
+}
+}

+return Ok(());
 }

 // General case for upserts to TenantsMap, excluding the case above: we will substitute an
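The fast path above bounds the optional flush with `tokio::time::timeout` and treats the elapsed case as a warning rather than a failure. A minimal standalone sketch of that three-way outcome handling, under the assumption that the flushed operation returns `anyhow::Result<()>`; `flush_remote` here is a hypothetical stand-in, not the pageserver's API:

```rust
use std::time::Duration;

// Hypothetical stand-in for the remote flush; the real call lives on Tenant.
async fn flush_remote() -> anyhow::Result<()> {
    tokio::time::sleep(Duration::from_millis(50)).await;
    Ok(())
}

async fn flush_with_deadline(flush: Option<Duration>) -> anyhow::Result<()> {
    if let Some(flush_timeout) = flush {
        match tokio::time::timeout(flush_timeout, flush_remote()).await {
            // Flush finished but reported an error: propagate it.
            Ok(Err(e)) => return Err(e),
            // Flush finished in time: nothing more to do.
            Ok(Ok(())) => return Ok(()),
            // Deadline elapsed: log and continue rather than failing the request.
            Err(_elapsed) => {
                eprintln!("timed out after {}ms, proceeding anyway", flush_timeout.as_millis());
            }
        }
    }
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    flush_with_deadline(Some(Duration::from_millis(10))).await
}
```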
@@ -915,55 +960,44 @@ impl TenantManager {
 slot_guard.drop_old_value().expect("We just shut it down");
 }

-// TODO(sharding): make local paths sharding-aware
-let tenant_path = self.conf.tenant_path(&tenant_shard_id.tenant_id);
+let tenant_path = self.conf.tenant_path(&tenant_shard_id);

 let new_slot = match &new_location_config.mode {
 LocationMode::Secondary(_) => {
 // Directory doesn't need to be fsync'd because if we crash it can
 // safely be recreated next time this tenant location is configured.
-unsafe_create_dir_all(&tenant_path)
+tokio::fs::create_dir_all(&tenant_path)
 .await
 .with_context(|| format!("Creating {tenant_path}"))?;

-// TODO(sharding): make local paths sharding-aware
-Tenant::persist_tenant_config(
-self.conf,
-&tenant_shard_id.tenant_id,
-&new_location_config,
-)
-.await
-.map_err(SetNewTenantConfigError::Persist)?;
+Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+.await
+.map_err(SetNewTenantConfigError::Persist)?;

 TenantSlot::Secondary
 }
 LocationMode::Attached(_attach_config) => {
-// TODO(sharding): make local paths sharding-aware
-let timelines_path = self.conf.timelines_path(&tenant_shard_id.tenant_id);
+let timelines_path = self.conf.timelines_path(&tenant_shard_id);

 // Directory doesn't need to be fsync'd because we do not depend on
 // it to exist after crashes: it may be recreated when tenant is
 // re-attached, see https://github.com/neondatabase/neon/issues/5550
-unsafe_create_dir_all(&timelines_path)
+tokio::fs::create_dir_all(&tenant_path)
 .await
 .with_context(|| format!("Creating {timelines_path}"))?;

-// TODO(sharding): make local paths sharding-aware
-Tenant::persist_tenant_config(
-self.conf,
-&tenant_shard_id.tenant_id,
-&new_location_config,
-)
-.await
-.map_err(SetNewTenantConfigError::Persist)?;
+Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+.await
+.map_err(SetNewTenantConfigError::Persist)?;

-// TODO(sharding): make spawn sharding-aware
+let shard_identity = new_location_config.shard;
 let tenant = tenant_spawn(
 self.conf,
-tenant_shard_id.tenant_id,
+tenant_shard_id,
 &tenant_path,
 self.resources.clone(),
 AttachedTenantConf::try_from(new_location_config)?,
+shard_identity,
 None,
 self.tenants,
 SpawnMode::Normal,
@@ -978,6 +1012,81 @@ impl TenantManager {

 Ok(())
 }

+/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
+/// LocationConf that was last used to attach it. Optionally, the local file cache may be
+/// dropped before re-attaching.
+///
+/// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
+/// where an issue is identified that would go away with a restart of the tenant.
+///
+/// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
+/// to respect the cancellation tokens used in normal shutdown().
+#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
+pub(crate) async fn reset_tenant(
+&self,
+tenant_shard_id: TenantShardId,
+drop_cache: bool,
+ctx: RequestContext,
+) -> anyhow::Result<()> {
+let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+let Some(old_slot) = slot_guard.get_old_value() else {
+anyhow::bail!("Tenant not found when trying to reset");
+};
+
+let Some(tenant) = old_slot.get_attached() else {
+slot_guard.revert();
+anyhow::bail!("Tenant is not in attached state");
+};
+
+let (_guard, progress) = utils::completion::channel();
+match tenant.shutdown(progress, false).await {
+Ok(()) => {
+slot_guard.drop_old_value()?;
+}
+Err(_barrier) => {
+slot_guard.revert();
+anyhow::bail!("Cannot reset Tenant, already shutting down");
+}
+}
+
+let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+let timelines_path = self.conf.timelines_path(&tenant_shard_id);
+let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+
+if drop_cache {
+tracing::info!("Dropping local file cache");
+
+match tokio::fs::read_dir(&timelines_path).await {
+Err(e) => {
+tracing::warn!("Failed to list timelines while dropping cache: {}", e);
+}
+Ok(mut entries) => {
+while let Some(entry) = entries.next_entry().await? {
+tokio::fs::remove_dir_all(entry.path()).await?;
+}
+}
+}
+}
+
+let shard_identity = config.shard;
+let tenant = tenant_spawn(
+self.conf,
+tenant_shard_id,
+&tenant_path,
+self.resources.clone(),
+AttachedTenantConf::try_from(config)?,
+shard_identity,
+None,
+self.tenants,
+SpawnMode::Normal,
+&ctx,
+)?;
+
+slot_guard.upsert(TenantSlot::Attached(tenant))?;
+
+Ok(())
+}
 }

 #[derive(Debug, thiserror::Error)]
@@ -1002,14 +1111,11 @@ pub(crate) enum GetTenantError {
 ///
 /// This method is cancel-safe.
 pub(crate) fn get_tenant(
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 active_only: bool,
 ) -> Result<Arc<Tenant>, GetTenantError> {
 let locked = TENANTS.read().unwrap();

-// TODO(sharding): make all callers of get_tenant shard-aware
-let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
 let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;

 match peek_slot {
@@ -1021,14 +1127,18 @@ pub(crate) fn get_tenant(
 TenantState::Active => Ok(Arc::clone(tenant)),
 _ => {
 if active_only {
-Err(GetTenantError::NotActive(tenant_id))
+Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
 } else {
 Ok(Arc::clone(tenant))
 }
 }
 },
-Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)),
-None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)),
+Some(TenantSlot::InProgress(_)) => {
+Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
+}
+None | Some(TenantSlot::Secondary) => {
+Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
+}
 }
 }

@@ -1062,6 +1172,7 @@ pub(crate) enum GetActiveTenantError {
 /// then wait for up to `timeout` (minus however long we waited for the slot).
 pub(crate) async fn get_active_tenant_with_timeout(
 tenant_id: TenantId,
+shard_selector: ShardSelector,
 timeout: Duration,
 cancel: &CancellationToken,
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
@@ -1070,15 +1181,17 @@ pub(crate) async fn get_active_tenant_with_timeout(
 Tenant(Arc<Tenant>),
 }

-// TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
-// to decide which shard services the request)
-let tenant_shard_id = TenantShardId::unsharded(tenant_id);

 let wait_start = Instant::now();
 let deadline = wait_start + timeout;

-let wait_for = {
+let (wait_for, tenant_shard_id) = {
 let locked = TENANTS.read().unwrap();

+// Resolve TenantId to TenantShardId
+let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
+GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
+)?;
+
 let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
 .map_err(GetTenantError::MapState)?;
 match peek_slot {
@@ -1088,7 +1201,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
 // Fast path: we don't need to do any async waiting.
 return Ok(tenant.clone());
 }
-_ => WaitFor::Tenant(tenant.clone()),
+_ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
 }
 }
 Some(TenantSlot::Secondary) => {
@@ -1096,7 +1209,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
 tenant_id,
 )))
 }
-Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()),
+Some(TenantSlot::InProgress(barrier)) => {
+(WaitFor::Barrier(barrier.clone()), tenant_shard_id)
+}
 None => {
 return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
 tenant_id,
@@ -1181,8 +1296,7 @@ pub(crate) async fn delete_tenant(
 // See https://github.com/neondatabase/neon/issues/5080

 // TODO(sharding): make delete API sharding-aware
-let mut slot_guard =
-tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

 // unwrap is safe because we used MustExist mode when acquiring
 let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1238,6 +1352,11 @@ pub(crate) async fn detach_tenant(
 // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
 // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
 let task_tenant_id = None;
+let cancel = crate::PAGESERVER_SHUTDOWN_TOKEN
+.get()
+.cloned()
+.unwrap_or_default()
+.child_token();
 task_mgr::spawn(
 task_mgr::BACKGROUND_RUNTIME.handle(),
 TaskKind::MgmtRequest,
@@ -1245,6 +1364,7 @@ pub(crate) async fn detach_tenant(
 None,
 "tenant_files_delete",
 false,
+cancel,
 async move {
 fs::remove_dir_all(tmp_path.as_path())
 .await
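The detach path above derives a child token from a process-wide shutdown token and hands it to the spawned cleanup task. A minimal standalone sketch of that parent/child relationship with `tokio_util::sync::CancellationToken`; the task body is purely illustrative, not the pageserver's cleanup logic:

```rust
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    // Process-wide shutdown token; cancelling it cancels every child token.
    let shutdown = CancellationToken::new();
    let cancel = shutdown.child_token();

    let task = tokio::spawn(async move {
        tokio::select! {
            // Stop early if shutdown was requested.
            _ = cancel.cancelled() => println!("cancelled before finishing"),
            // Otherwise do the (illustrative) cleanup work.
            _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => println!("cleanup done"),
        }
    });

    // Simulate process shutdown: the child token observes it immediately.
    shutdown.cancel();
    task.await.unwrap();
}
```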
@@ -1262,8 +1382,7 @@ async fn detach_tenant0(
 deletion_queue_client: &DeletionQueueClient,
 ) -> Result<Utf8PathBuf, TenantStateError> {
 let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
-// TODO(sharding): make local path helpers shard-aware
-let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean.tenant_id);
+let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
 safe_rename_tenant_dir(&local_tenant_directory)
 .await
 .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
@@ -1288,8 +1407,7 @@ async fn detach_tenant0(
 Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
 )
 {
-// TODO(sharding): make local paths sharding-aware
-let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id.tenant_id);
+let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
 if tenant_ignore_mark.exists() {
 info!("Detaching an ignored tenant");
 let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
@@ -1318,9 +1436,9 @@ pub(crate) async fn load_tenant(

 let slot_guard =
 tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
-let tenant_path = conf.tenant_path(&tenant_id);
+let tenant_path = conf.tenant_path(&tenant_shard_id);

-let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
+let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
 if tenant_ignore_mark.exists() {
 std::fs::remove_file(&tenant_ignore_mark).with_context(|| {
 format!(
@@ -1336,17 +1454,19 @@ pub(crate) async fn load_tenant(
 };

 let mut location_conf =
-Tenant::load_tenant_config(conf, &tenant_id).map_err(TenantMapInsertError::Other)?;
+Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
 location_conf.attach_in_generation(generation);

-Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
+Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

+let shard_identity = location_conf.shard;
 let new_tenant = tenant_spawn(
 conf,
-tenant_id,
+tenant_shard_id,
 &tenant_path,
 resources,
 AttachedTenantConf::try_from(location_conf)?,
+shard_identity,
 None,
 &TENANTS,
 SpawnMode::Normal,
@@ -1374,7 +1494,7 @@ async fn ignore_tenant0(
 let tenant_shard_id = TenantShardId::unsharded(tenant_id);

 remove_tenant_from_memory(tenants, tenant_shard_id, async {
-let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
+let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
 fs::File::create(&ignore_mark_file)
 .await
 .context("Failed to create ignore mark file")
@@ -1397,7 +1517,8 @@ pub(crate) enum TenantMapListError {
 ///
 /// Get list of tenants, for the mgmt API
 ///
-pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
+pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>, TenantMapListError>
+{
 let tenants = TENANTS.read().unwrap();
 let m = match &*tenants {
 TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1405,12 +1526,10 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError>
 };
 Ok(m.iter()
 .filter_map(|(id, tenant)| match tenant {
-TenantSlot::Attached(tenant) => Some((id, tenant.current_state())),
+TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
 TenantSlot::Secondary => None,
 TenantSlot::InProgress(_) => None,
 })
-// TODO(sharding): make callers of this function shard-aware
-.map(|(k, v)| (k.tenant_id, v))
 .collect())
 }

@@ -1432,16 +1551,18 @@ pub(crate) async fn attach_tenant(
 let slot_guard =
 tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
 let location_conf = LocationConf::attached_single(tenant_conf, generation);
-let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
+let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
 // TODO: tenant directory remains on disk if we bail out from here on.
 // See https://github.com/neondatabase/neon/issues/4233

+let shard_identity = location_conf.shard;
 let attached_tenant = tenant_spawn(
 conf,
-tenant_id,
+tenant_shard_id,
 &tenant_dir,
 resources,
 AttachedTenantConf::try_from(location_conf)?,
+shard_identity,
 None,
 &TENANTS,
 SpawnMode::Normal,
@@ -1507,9 +1628,10 @@ pub enum TenantSlotUpsertError {
 MapState(#[from] TenantMapError),
 }

-#[derive(Debug)]
+#[derive(Debug, thiserror::Error)]
 enum TenantSlotDropError {
 /// It is only legal to drop a TenantSlot if its contents are fully shut down
+#[error("Tenant was not shut down")]
 NotShutdown,
 }

@@ -1569,9 +1691,9 @@ impl SlotGuard {
 }
 }

-/// Take any value that was present in the slot before we acquired ownership
+/// Get any value that was present in the slot before we acquired ownership
 /// of it: in state transitions, this will be the old state.
-fn get_old_value(&mut self) -> &Option<TenantSlot> {
+fn get_old_value(&self) -> &Option<TenantSlot> {
 &self.old_value
 }

@@ -1789,7 +1911,7 @@ fn tenant_map_acquire_slot_impl(
 METRICS.tenant_slot_writes.inc();

 let mut locked = tenants.write().unwrap();
-let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug());
+let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
 let _guard = span.enter();

 let m = match &mut *locked {
@@ -1941,16 +2063,18 @@ use {
 };

 pub(crate) async fn immediate_gc(
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 timeline_id: TimelineId,
 gc_req: TimelineGcRequest,
+cancel: CancellationToken,
 ctx: &RequestContext,
 ) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
 let guard = TENANTS.read().unwrap();

 let tenant = guard
-.get(&tenant_id)
+.get(&tenant_shard_id)
 .map(Arc::clone)
-.with_context(|| format!("tenant {tenant_id}"))
+.with_context(|| format!("tenant {tenant_shard_id}"))
 .map_err(|e| ApiError::NotFound(e.into()))?;

 let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
@@ -1960,21 +2084,51 @@ pub(crate) async fn immediate_gc(
 // Run in task_mgr to avoid race with tenant_detach operation
 let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
 let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+// TODO: spawning is redundant now, need to hold the gate
 task_mgr::spawn(
 &tokio::runtime::Handle::current(),
 TaskKind::GarbageCollector,
-Some(tenant_id),
+Some(tenant_shard_id),
 Some(timeline_id),
-&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
+&format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
 false,
+tenant.cancel.child_token(),
 async move {
 fail::fail_point!("immediate_gc_task_pre");
-let result = tenant
-.gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
-.instrument(info_span!("manual_gc", %tenant_id, %timeline_id))
+#[allow(unused_mut)]
+let mut result = tenant
+.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
+.instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
 .await;
 // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
 // better once the types support it.

+#[cfg(feature = "testing")]
+{
+if let Ok(result) = result.as_mut() {
+// why not futures unordered? it seems it needs very much the same task structure
+// but would only run on single task.
+let mut js = tokio::task::JoinSet::new();
+for layer in std::mem::take(&mut result.doomed_layers) {
+js.spawn(layer.wait_drop());
+}
+tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped");
+while let Some(res) = js.join_next().await {
+res.expect("wait_drop should not panic");
+}
+}
+
+let timeline = tenant.get_timeline(timeline_id, false).ok();
+let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
+
+if let Some(rtc) = rtc {
+// layer drops schedule actions on remote timeline client to actually do the
+// deletions; don't care just exit fast about the shutdown error
+drop(rtc.wait_completion().await);
+}
+}
+
 match task_done.send(result) {
 Ok(_) => (),
 Err(result) => error!("failed to send gc result: {result:?}"),
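The testing-only block above fans the doomed layers out into a `tokio::task::JoinSet` and drains it with `join_next`. A minimal standalone sketch of that fan-out/drain pattern; the per-item future here is a placeholder sleep rather than the real `layer.wait_drop()`:

```rust
use std::time::Duration;
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let mut js = JoinSet::new();

    // Spawn one future per item we need to wait on (stand-in for layer.wait_drop()).
    for i in 0..4u64 {
        js.spawn(async move {
            tokio::time::sleep(Duration::from_millis(10 * i)).await;
            i
        });
    }

    println!("waiting for {} tasks", js.len());

    // Drain the set: join_next returns None once every task has completed.
    while let Some(res) = js.join_next().await {
        let i = res.expect("task should not panic");
        println!("task {i} finished");
    }
}
```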
@@ -188,8 +188,11 @@ use anyhow::Context;
 use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};

+pub(crate) use download::download_initdb_tar_zst;
+use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
+pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
 self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
@@ -249,6 +252,11 @@ pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
 // retries. Uploads and deletions are retried forever, though.
 pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

+pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
+
+/// Default buffer size when interfacing with [`tokio::fs::File`].
+pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
+
 pub enum MaybeDeletedIndexPart {
 IndexPart(IndexPart),
 Deleted(IndexPart),
@@ -297,7 +305,7 @@ pub struct RemoteTimelineClient {

 runtime: tokio::runtime::Handle,

-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 timeline_id: TimelineId,
 generation: Generation,

@@ -321,7 +329,7 @@ impl RemoteTimelineClient {
 remote_storage: GenericRemoteStorage,
 deletion_queue_client: DeletionQueueClient,
 conf: &'static PageServerConf,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 timeline_id: TimelineId,
 generation: Generation,
 ) -> RemoteTimelineClient {
@@ -333,22 +341,29 @@ impl RemoteTimelineClient {
 } else {
 BACKGROUND_RUNTIME.handle().clone()
 },
-tenant_id,
+tenant_shard_id,
 timeline_id,
 generation,
 storage_impl: remote_storage,
 deletion_queue_client,
 upload_queue: Mutex::new(UploadQueue::Uninitialized),
-metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
+metrics: Arc::new(RemoteTimelineClientMetrics::new(
+&tenant_shard_id,
+&timeline_id,
+)),
 }
 }

 /// Initialize the upload queue for a remote storage that already received
 /// an index file upload, i.e., it's not empty.
 /// The given `index_part` must be the one on the remote.
-pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
+pub fn init_upload_queue(
+&self,
+index_part: &IndexPart,
+cancel: CancellationToken,
+) -> anyhow::Result<()> {
 let mut upload_queue = self.upload_queue.lock().unwrap();
-upload_queue.initialize_with_current_remote_index_part(index_part)?;
+upload_queue.initialize_with_current_remote_index_part(index_part, cancel)?;
 self.update_remote_physical_size_gauge(Some(index_part));
 info!(
 "initialized upload queue from remote index with {} layer files",
@@ -362,9 +377,10 @@ impl RemoteTimelineClient {
 pub fn init_upload_queue_for_empty_remote(
 &self,
 local_metadata: &TimelineMetadata,
+cancel: CancellationToken,
 ) -> anyhow::Result<()> {
 let mut upload_queue = self.upload_queue.lock().unwrap();
-upload_queue.initialize_empty_remote(local_metadata)?;
+upload_queue.initialize_empty_remote(local_metadata, cancel)?;
 self.update_remote_physical_size_gauge(None);
 info!("initialized upload queue as empty");
 Ok(())
@@ -375,6 +391,7 @@ impl RemoteTimelineClient {
 pub fn init_upload_queue_stopped_to_continue_deletion(
 &self,
 index_part: &IndexPart,
+cancel: CancellationToken,
 ) -> anyhow::Result<()> {
 // FIXME: consider newtype for DeletedIndexPart.
 let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
@@ -383,7 +400,7 @@ impl RemoteTimelineClient {

 {
 let mut upload_queue = self.upload_queue.lock().unwrap();
-upload_queue.initialize_with_current_remote_index_part(index_part)?;
+upload_queue.initialize_with_current_remote_index_part(index_part, cancel)?;
 self.update_remote_physical_size_gauge(Some(index_part));
 }
 // also locks upload queue, without dropping the guard above it will be a deadlock
@@ -460,13 +477,13 @@ impl RemoteTimelineClient {

 let index_part = download::download_index_part(
 &self.storage_impl,
-&self.tenant_id,
+&self.tenant_shard_id,
 &self.timeline_id,
 self.generation,
 cancel,
 )
 .measure_remote_op(
-self.tenant_id,
+self.tenant_shard_id.tenant_id,
 self.timeline_id,
 RemoteOpFileKind::Index,
 RemoteOpKind::Download,
@@ -502,13 +519,13 @@ impl RemoteTimelineClient {
 download::download_layer_file(
 self.conf,
 &self.storage_impl,
-self.tenant_id,
+self.tenant_shard_id,
 self.timeline_id,
 layer_file_name,
 layer_metadata,
 )
 .measure_remote_op(
-self.tenant_id,
+self.tenant_shard_id.tenant_id,
 self.timeline_id,
 RemoteOpFileKind::Layer,
 RemoteOpKind::Download,
@@ -654,10 +671,10 @@ impl RemoteTimelineClient {
 let mut guard = self.upload_queue.lock().unwrap();
 let upload_queue = guard.initialized_mut()?;

-let with_generations =
+let with_metadata =
 self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());

-self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
+self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);

 // Launch the tasks immediately, if possible
 self.launch_queued_tasks(upload_queue);
@@ -692,7 +709,7 @@ impl RemoteTimelineClient {
 self: &Arc<Self>,
 upload_queue: &mut UploadQueueInitialized,
 names: I,
-) -> Vec<(LayerFileName, Generation)>
+) -> Vec<(LayerFileName, LayerFileMetadata)>
 where
 I: IntoIterator<Item = LayerFileName>,
 {
@@ -700,16 +717,17 @@ impl RemoteTimelineClient {
 // so we don't need update it. Just serialize it.
 let metadata = upload_queue.latest_metadata.clone();

-// Decorate our list of names with each name's generation, dropping
-// names that are unexpectedly missing from our metadata.
-let with_generations: Vec<_> = names
+// Decorate our list of names with each name's metadata, dropping
+// names that are unexpectedly missing from our metadata. This metadata
+// is later used when physically deleting layers, to construct key paths.
+let with_metadata: Vec<_> = names
 .into_iter()
 .filter_map(|name| {
 let meta = upload_queue.latest_files.remove(&name);

 if let Some(meta) = meta {
 upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-Some((name, meta.generation))
+Some((name, meta))
 } else {
 // This can only happen if we forgot to to schedule the file upload
 // before scheduling the delete. Log it because it is a rare/strange
@@ -722,9 +740,10 @@ impl RemoteTimelineClient {
 .collect();

 #[cfg(feature = "testing")]
-for (name, gen) in &with_generations {
-if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
-if &unexpected == gen {
+for (name, metadata) in &with_metadata {
+let gen = metadata.generation;
+if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) {
+if unexpected == gen {
 tracing::error!("{name} was unlinked twice with same generation");
 } else {
 tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
@@ -739,14 +758,14 @@ impl RemoteTimelineClient {
 self.schedule_index_upload(upload_queue, metadata);
 }

-with_generations
+with_metadata
 }

 /// Schedules deletion for layer files which have previously been unlinked from the
 /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
 pub(crate) fn schedule_deletion_of_unlinked(
 self: &Arc<Self>,
-layers: Vec<(LayerFileName, Generation)>,
+layers: Vec<(LayerFileName, LayerFileMetadata)>,
 ) -> anyhow::Result<()> {
 let mut guard = self.upload_queue.lock().unwrap();
 let upload_queue = guard.initialized_mut()?;
@@ -759,16 +778,22 @@ impl RemoteTimelineClient {
 fn schedule_deletion_of_unlinked0(
 self: &Arc<Self>,
 upload_queue: &mut UploadQueueInitialized,
-with_generations: Vec<(LayerFileName, Generation)>,
+with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
 ) {
-for (name, gen) in &with_generations {
-info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
+for (name, meta) in &with_metadata {
+info!(
+"scheduling deletion of layer {}{} (shard {})",
+name,
+meta.generation.get_suffix(),
+meta.shard
+);
 }

 #[cfg(feature = "testing")]
-for (name, gen) in &with_generations {
+for (name, meta) in &with_metadata {
+let gen = meta.generation;
 match upload_queue.dangling_files.remove(name) {
-Some(same) if &same == gen => { /* expected */ }
+Some(same) if same == gen => { /* expected */ }
 Some(other) => {
 tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
 }
@@ -780,7 +805,7 @@ impl RemoteTimelineClient {

 // schedule the actual deletions
 let op = UploadOp::Delete(Delete {
-layers: with_generations,
+layers: with_metadata,
 });
 self.calls_unfinished_metric_begin(&op);
 upload_queue.queued_operations.push_back(op);
@@ -809,23 +834,29 @@ impl RemoteTimelineClient {
 Ok(())
 }

-///
 /// Wait for all previously scheduled uploads/deletions to complete
-///
-pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
+pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
 let mut receiver = {
 let mut guard = self.upload_queue.lock().unwrap();
 let upload_queue = guard.initialized_mut()?;
-self.schedule_barrier(upload_queue)
+self.schedule_barrier0(upload_queue)
 };

 if receiver.changed().await.is_err() {
 anyhow::bail!("wait_completion aborted because upload queue was stopped");
 }

 Ok(())
 }

-fn schedule_barrier(
+pub(crate) fn schedule_barrier(self: &Arc<Self>) -> anyhow::Result<()> {
+let mut guard = self.upload_queue.lock().unwrap();
+let upload_queue = guard.initialized_mut()?;
+self.schedule_barrier0(upload_queue);
+Ok(())
+}
+
+fn schedule_barrier0(
 self: &Arc<Self>,
 upload_queue: &mut UploadQueueInitialized,
 ) -> tokio::sync::watch::Receiver<()> {
@@ -841,6 +872,56 @@ impl RemoteTimelineClient {
 receiver
 }

+/// Wait for all previously scheduled operations to complete, and then stop.
+///
+/// Not cancellation safe
+pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
+// On cancellation the queue is left in ackward state of refusing new operations but
+// proper stop is yet to be called. On cancel the original or some later task must call
+// `stop` or `shutdown`.
+let sg = scopeguard::guard((), |_| {
+tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error")
+});
+
+let fut = {
+let mut guard = self.upload_queue.lock().unwrap();
+let upload_queue = match &mut *guard {
+UploadQueue::Stopped(_) => return Ok(()),
+UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
+UploadQueue::Initialized(ref mut init) => init,
+};
+
+// if the queue is already stuck due to a shutdown operation which was cancelled, then
+// just don't add more of these as they would never complete.
+//
+// TODO: if launch_queued_tasks were to be refactored to accept a &mut UploadQueue
+// in every place we would not have to jump through this hoop, and this method could be
+// made cancellable.
+if !upload_queue.shutting_down {
+upload_queue.shutting_down = true;
+upload_queue.queued_operations.push_back(UploadOp::Shutdown);
+// this operation is not counted similar to Barrier
+
+self.launch_queued_tasks(upload_queue);
+}
+
+upload_queue.shutdown_ready.clone().acquire_owned()
+};
+
+let res = fut.await;
+
+scopeguard::ScopeGuard::into_inner(sg);
+
+match res {
+Ok(_permit) => unreachable!("shutdown_ready should not have been added permits"),
+Err(_closed) => {
+// expected
+}
+}
+
+self.stop()
+}
+
 /// Set the deleted_at field in the remote index file.
 ///
 /// This fails if the upload queue has not been `stop()`ed.
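The new `shutdown` above is documented as not cancellation safe and uses a `scopeguard` to make accidental cancellation loud: the guard's closure fires only if the future is dropped before reaching `ScopeGuard::into_inner`. A minimal standalone sketch of that pattern; the guarded work here is a placeholder sleep, not the upload queue:

```rust
use std::time::Duration;

// Not cancellation safe: if the returned future is dropped mid-way, the guard runs.
async fn not_cancel_safe_work() {
    let sg = scopeguard::guard((), |_| {
        eprintln!("work was cancelled; this should not happen");
    });

    // Placeholder for the real work (e.g. waiting on the upload queue to drain).
    tokio::time::sleep(Duration::from_millis(50)).await;

    // Reached the end normally: defuse the guard so it never fires.
    scopeguard::ScopeGuard::into_inner(sg);
}

#[tokio::main]
async fn main() {
    // Completes normally: no message printed.
    not_cancel_safe_work().await;

    // Cancelled by a short timeout: dropping the future fires the guard.
    let _ = tokio::time::timeout(Duration::from_millis(1), not_cancel_safe_work()).await;
}
```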
@@ -892,7 +973,7 @@ impl RemoteTimelineClient {
 || {
 upload::upload_index_part(
 &self.storage_impl,
-&self.tenant_id,
+&self.tenant_shard_id,
 &self.timeline_id,
 self.generation,
 &index_part_with_deleted_at,
@@ -950,8 +1031,9 @@ impl RemoteTimelineClient {
 .drain()
 .map(|(file_name, meta)| {
 remote_layer_path(
-&self.tenant_id,
+&self.tenant_shard_id.tenant_id,
 &self.timeline_id,
+meta.shard,
 &file_name,
 meta.generation,
 )
@@ -964,7 +1046,7 @@ impl RemoteTimelineClient {

 // Do not delete index part yet, it is needed for possible retry. If we remove it first
 // and retry will arrive to different pageserver there wont be any traces of it on remote storage
-let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
+let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);

 // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
 // taking the burden of listing all the layers that we already know we should delete.
@@ -1000,12 +1082,22 @@ impl RemoteTimelineClient {
 .unwrap_or(
 // No generation-suffixed indices, assume we are dealing with
 // a legacy index.
-remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
+remote_index_path(&self.tenant_shard_id, &self.timeline_id, Generation::none()),
 );

 let remaining_layers: Vec<RemotePath> = remaining
 .into_iter()
-.filter(|p| p != &latest_index)
+.filter(|p| {
+if p == &latest_index {
+return false;
+}
+if let Some(name) = p.object_name() {
+if name == INITDB_PATH {
+return false;
+}
+}
+true
+})
 .inspect(|path| {
 if let Some(name) = path.object_name() {
 info!(%name, "deleting a file not referenced from index_part.json");
@@ -1071,7 +1163,9 @@ impl RemoteTimelineClient {
 upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
 }

-UploadOp::Barrier(_) => upload_queue.inprogress_tasks.is_empty(),
+UploadOp::Barrier(_) | UploadOp::Shutdown => {
+upload_queue.inprogress_tasks.is_empty()
+}
 };

 // If we cannot launch this task, don't look any further.
@@ -1084,6 +1178,13 @@ impl RemoteTimelineClient {
 break;
 }

+if let UploadOp::Shutdown = next_op {
+// leave the op in the queue but do not start more tasks; it will be dropped when
+// the stop is called.
+upload_queue.shutdown_ready.close();
+break;
+}
+
 // We can launch this task. Remove it from the queue first.
 let next_op = upload_queue.queued_operations.pop_front().unwrap();

@@ -1104,6 +1205,7 @@ impl RemoteTimelineClient {
 sender.send_replace(());
 continue;
 }
+UploadOp::Shutdown => unreachable!("shutdown is intentionally never popped off"),
 };

 // Assign unique ID to this task
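Several of these hunks teach the upload queue a `Shutdown` sentinel: it is pushed once, blocks further launches when it reaches the front of the queue, and is intentionally never popped. A minimal standalone sketch of that sentinel-in-a-queue idea using a plain `VecDeque`; the op names mirror the diff, but the types here are illustrative rather than the pageserver's:

```rust
use std::collections::VecDeque;

#[derive(Debug)]
enum UploadOp {
    UploadLayer(&'static str),
    Shutdown,
}

fn launch_queued_tasks(queue: &mut VecDeque<UploadOp>) {
    while let Some(next_op) = queue.front() {
        // The sentinel stays in the queue: stop launching and let shutdown proceed elsewhere.
        if let UploadOp::Shutdown = next_op {
            println!("shutdown reached the front; no more tasks will start");
            break;
        }
        // A real implementation would spawn a task here; we just log the op.
        let op = queue.pop_front().unwrap();
        println!("launching {op:?}");
    }
}

fn main() {
    let mut queue = VecDeque::new();
    queue.push_back(UploadOp::UploadLayer("layer-a"));
    queue.push_back(UploadOp::UploadLayer("layer-b"));
    queue.push_back(UploadOp::Shutdown);

    launch_queued_tasks(&mut queue);

    // The sentinel is still at the front afterwards, exactly as in the diff.
    assert!(matches!(queue.front(), Some(UploadOp::Shutdown)));
}
```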
@@ -1122,20 +1224,21 @@ impl RemoteTimelineClient {

 // Spawn task to perform the task
 let self_rc = Arc::clone(self);
-let tenant_id = self.tenant_id;
+let tenant_shard_id = self.tenant_shard_id;
 let timeline_id = self.timeline_id;
 task_mgr::spawn(
 &self.runtime,
 TaskKind::RemoteUploadTask,
-Some(self.tenant_id),
+Some(self.tenant_shard_id),
 Some(self.timeline_id),
 "remote upload",
 false,
+upload_queue.cancel.child_token(),
 async move {
 self_rc.perform_upload_task(task).await;
 Ok(())
 }
-.instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)),
+.instrument(info_span!(parent: None, "remote_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, %upload_task_id)),
 );

 // Loop back to process next task
@@ -1187,7 +1290,7 @@ impl RemoteTimelineClient {
 self.generation,
 )
 .measure_remote_op(
-self.tenant_id,
+self.tenant_shard_id.tenant_id,
 self.timeline_id,
 RemoteOpFileKind::Layer,
 RemoteOpKind::Upload,
@@ -1207,13 +1310,13 @@ impl RemoteTimelineClient {

 let res = upload::upload_index_part(
 &self.storage_impl,
-&self.tenant_id,
+&self.tenant_shard_id,
 &self.timeline_id,
 self.generation,
 index_part,
 )
 .measure_remote_op(
-self.tenant_id,
+self.tenant_shard_id.tenant_id,
 self.timeline_id,
 RemoteOpFileKind::Index,
 RemoteOpKind::Upload,
@@ -1229,20 +1332,22 @@ impl RemoteTimelineClient {
 }
 res
 }
-UploadOp::Delete(delete) => self
-.deletion_queue_client
-.push_layers(
-self.tenant_id,
-self.timeline_id,
-self.generation,
-delete.layers.clone(),
-)
-.await
-.map_err(|e| anyhow::anyhow!(e)),
-UploadOp::Barrier(_) => {
+UploadOp::Delete(delete) => {
+pausable_failpoint!("before-delete-layer-pausable");
+self.deletion_queue_client
+.push_layers(
+self.tenant_shard_id,
+self.timeline_id,
+self.generation,
+delete.layers.clone(),
+)
+.await
+.map_err(|e| anyhow::anyhow!(e))
+}
+unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => {
 // unreachable. Barrier operations are handled synchronously in
 // launch_queued_tasks
-warn!("unexpected Barrier operation in perform_upload_task");
+warn!("unexpected {unexpected:?} operation in perform_upload_task");
 break;
 }
 };
@@ -1336,7 +1441,7 @@ impl RemoteTimelineClient {
 upload_queue.num_inprogress_deletions -= 1;
 None
 }
-UploadOp::Barrier(_) => unreachable!(),
+UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
 };

 // Launch any queued tasks that were unblocked by this one.
@@ -1350,7 +1455,7 @@ impl RemoteTimelineClient {
 // data safety guarantees (see docs/rfcs/025-generation-numbers.md)
 self.deletion_queue_client
 .update_remote_consistent_lsn(
-self.tenant_id,
+self.tenant_shard_id,
 self.timeline_id,
 self.generation,
 lsn,
@@ -1391,7 +1496,7 @@ impl RemoteTimelineClient {
 reason: "should we track deletes? positive or negative sign?",
 },
 ),
-UploadOp::Barrier(_) => {
+UploadOp::Barrier(..) | UploadOp::Shutdown => {
 // we do not account these
 return None;
 }
@@ -1417,10 +1522,13 @@ impl RemoteTimelineClient {
 }

 /// Close the upload queue for new operations and cancel queued operations.
+///
+/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
///
|
||||||
/// In-progress operations will still be running after this function returns.
|
/// In-progress operations will still be running after this function returns.
|
||||||
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
||||||
/// to wait for them to complete, after calling this function.
|
/// to wait for them to complete, after calling this function.
|
||||||
pub fn stop(&self) -> Result<(), StopError> {
|
pub(crate) fn stop(&self) -> Result<(), StopError> {
|
||||||
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
||||||
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
|
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
|
||||||
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
|
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
|
||||||
@@ -1458,6 +1566,15 @@ impl RemoteTimelineClient {
|
|||||||
queued_operations: VecDeque::default(),
|
queued_operations: VecDeque::default(),
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
dangling_files: HashMap::default(),
|
dangling_files: HashMap::default(),
|
||||||
|
shutting_down: false,
|
||||||
|
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||||
|
// TODO: this is the only place where we cannot reasonably continue the
|
||||||
|
// tree
|
||||||
|
cancel: crate::PAGESERVER_SHUTDOWN_TOKEN
|
||||||
|
.get()
|
||||||
|
.cloned()
|
||||||
|
.unwrap_or_default()
|
||||||
|
.child_token(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let upload_queue = std::mem::replace(
|
let upload_queue = std::mem::replace(
|
||||||
@@ -1503,24 +1620,32 @@ impl RemoteTimelineClient {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
|
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||||
let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
|
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
|
||||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
|
pub fn remote_timeline_path(
|
||||||
remote_timelines_path(tenant_id).join(Utf8Path::new(&timeline_id.to_string()))
|
tenant_shard_id: &TenantShardId,
|
||||||
|
timeline_id: &TimelineId,
|
||||||
|
) -> RemotePath {
|
||||||
|
remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Note that the shard component of a remote layer path is _not_ always the same
|
||||||
|
/// as in the TenantShardId of the caller: tenants may reference layers from a different
|
||||||
|
/// ShardIndex. Use the ShardIndex from the layer's metadata.
|
||||||
pub fn remote_layer_path(
|
pub fn remote_layer_path(
|
||||||
tenant_id: &TenantId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
|
shard: ShardIndex,
|
||||||
layer_file_name: &LayerFileName,
|
layer_file_name: &LayerFileName,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
) -> RemotePath {
|
) -> RemotePath {
|
||||||
// Generation-aware key format
|
// Generation-aware key format
|
||||||
let path = format!(
|
let path = format!(
|
||||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
"tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
|
||||||
|
shard.get_suffix(),
|
||||||
layer_file_name.file_name(),
|
layer_file_name.file_name(),
|
||||||
generation.get_suffix()
|
generation.get_suffix()
|
||||||
);
|
);
|
||||||
@@ -1528,13 +1653,20 @@ pub fn remote_layer_path(
|
|||||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
|
||||||
|
RemotePath::from_string(&format!(
|
||||||
|
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PATH}"
|
||||||
|
))
|
||||||
|
.expect("Failed to construct path")
|
||||||
|
}
|
||||||
|
|
||||||
pub fn remote_index_path(
|
pub fn remote_index_path(
|
||||||
tenant_id: &TenantId,
|
tenant_shard_id: &TenantShardId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
) -> RemotePath {
|
) -> RemotePath {
|
||||||
RemotePath::from_string(&format!(
|
RemotePath::from_string(&format!(
|
||||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
"tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
||||||
IndexPart::FILE_NAME,
|
IndexPart::FILE_NAME,
|
||||||
generation.get_suffix()
|
generation.get_suffix()
|
||||||
))
|
))
|
||||||
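The path helpers above compose remote object keys from a tenant (or tenant-shard) component, a fixed timelines segment, the timeline id, and optional shard and generation suffixes. The sketch below only illustrates the resulting key shapes; the concrete suffix spellings ("-0104", "-00000005"), the "timelines" segment name, and the layer file name are assumptions made for illustration, not taken from this diff.

```rust
fn main() {
    // Hypothetical values, purely to show the key shapes the helpers above produce.
    let tenant_id = "3aa8fcc61f6d357410b7de754b1d9001";
    let timeline_id = "11223344556677881122334455667788";
    let shard_suffix = "-0104"; // assumed spelling of ShardIndex::get_suffix()
    let generation_suffix = "-00000005"; // assumed spelling of Generation::get_suffix()
    let layer_name = "SOME_LAYER_FILE_NAME"; // placeholder

    // remote_layer_path: the shard suffix is attached to the tenant component.
    let layer_key = format!(
        "tenants/{tenant_id}{shard_suffix}/timelines/{timeline_id}/{layer_name}{generation_suffix}"
    );

    // remote_index_path: the whole tenant-shard id forms the tenant component.
    let index_key = format!(
        "tenants/{tenant_id}{shard_suffix}/timelines/{timeline_id}/index_part.json{generation_suffix}"
    );

    println!("{layer_key}\n{index_key}");
}
```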
@@ -1676,14 +1808,14 @@ mod tests {
 Arc::new(RemoteTimelineClient {
 conf: self.harness.conf,
 runtime: tokio::runtime::Handle::current(),
-tenant_id: self.harness.tenant_id,
+tenant_shard_id: self.harness.tenant_shard_id,
 timeline_id: TIMELINE_ID,
 generation,
 storage_impl: self.harness.remote_storage.clone(),
 deletion_queue_client: self.harness.deletion_queue.new_client(),
 upload_queue: Mutex::new(UploadQueue::Uninitialized),
 metrics: Arc::new(RemoteTimelineClientMetrics::new(
-&self.harness.tenant_id,
+&self.harness.tenant_shard_id,
 &TIMELINE_ID,
 )),
 })
@@ -1759,6 +1891,7 @@ mod tests {
 println!("remote_timeline_dir: {remote_timeline_dir}");
 
 let generation = harness.generation;
+let shard = harness.shard;
 
 // Create a couple of dummy files, schedule upload for them
 
@@ -1775,7 +1908,7 @@ mod tests {
 harness.conf,
 &timeline,
 name,
-LayerFileMetadata::new(contents.len() as u64, generation),
+LayerFileMetadata::new(contents.len() as u64, generation, shard),
 )
 }).collect::<Vec<_>>();
 
@@ -1924,7 +2057,7 @@ mod tests {
 harness.conf,
 &timeline,
 layer_file_name_1.clone(),
-LayerFileMetadata::new(content_1.len() as u64, harness.generation),
+LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
 );
 
 #[derive(Debug, PartialEq, Clone, Copy)]
@@ -2010,7 +2143,12 @@ mod tests {
 std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
 
 let index_path = test_state.harness.remote_fs_dir.join(
-remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
+remote_index_path(
+&test_state.harness.tenant_shard_id,
+&TIMELINE_ID,
+generation,
+)
+.get_path(),
 );
 eprintln!("Writing {index_path}");
 std::fs::write(&index_path, index_part_bytes).unwrap();
@@ -8,10 +8,12 @@ use std::future::Future;
 use std::time::Duration;
 
 use anyhow::{anyhow, Context};
-use camino::Utf8Path;
-use tokio::fs;
-use tokio::io::AsyncWriteExt;
+use camino::{Utf8Path, Utf8PathBuf};
+use pageserver_api::shard::TenantShardId;
+use tokio::fs::{self, File, OpenOptions};
+use tokio::io::{AsyncSeekExt, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
+use tracing::warn;
 use utils::{backoff, crashsafe};
 
 use crate::config::PageServerConf;
@@ -19,14 +21,15 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
+use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;
 
 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
-parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
-FAILED_REMOTE_OP_RETRIES,
+parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
+FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
 };
 
 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
@@ -39,7 +42,7 @@ static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
 pub async fn download_layer_file<'a>(
 conf: &'static PageServerConf,
 storage: &'a GenericRemoteStorage,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 timeline_id: TimelineId,
 layer_file_name: &'a LayerFileName,
 layer_metadata: &'a LayerFileMetadata,
@@ -47,12 +50,13 @@ pub async fn download_layer_file<'a>(
 debug_assert_current_span_has_tenant_and_timeline_id();
 
 let local_path = conf
-.timeline_path(&tenant_id, &timeline_id)
+.timeline_path(&tenant_shard_id, &timeline_id)
 .join(layer_file_name.file_name());
 
 let remote_path = remote_layer_path(
-&tenant_id,
+&tenant_shard_id.tenant_id,
 &timeline_id,
+layer_metadata.shard,
 layer_file_name,
 layer_metadata.generation,
 );
@@ -71,12 +75,11 @@ pub async fn download_layer_file<'a>(
 
 let (mut destination_file, bytes_amount) = download_retry(
 || async {
-// TODO: this doesn't use the cached fd for some reason?
-let mut destination_file = fs::File::create(&temp_file_path)
+let destination_file = tokio::fs::File::create(&temp_file_path)
 .await
 .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
 .map_err(DownloadError::Other)?;
-let mut download = storage
+let download = storage
 .download(&remote_path)
 .await
 .with_context(|| {
@@ -86,9 +89,14 @@ pub async fn download_layer_file<'a>(
 })
 .map_err(DownloadError::Other)?;
 
+let mut destination_file =
+tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
+
+let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+
 let bytes_amount = tokio::time::timeout(
 MAX_DOWNLOAD_DURATION,
-tokio::io::copy(&mut download.download_stream, &mut destination_file),
+tokio::io::copy_buf(&mut reader, &mut destination_file),
 )
 .await
 .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
@@ -99,6 +107,8 @@ pub async fn download_layer_file<'a>(
 })
 .map_err(DownloadError::Other)?;
 
+let destination_file = destination_file.into_inner();
+
 Ok((destination_file, bytes_amount))
 },
 &format!("download {remote_path:?}"),
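The hunk above replaces `tokio::io::copy` on the raw download stream with a `tokio_util::io::StreamReader` feeding a `BufWriter` via `copy_buf`. A minimal, self-contained sketch of that pattern, with an in-memory chunk stream standing in for the remote download stream (that substitution, the file name, and the buffer size are assumptions for the example):

```rust
use tokio::io::AsyncWriteExt;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Stand-in for a remote download stream: Result<Bytes, io::Error> chunks.
    let chunks = vec![
        Ok::<_, std::io::Error>(bytes::Bytes::from_static(b"first chunk, ")),
        Ok(bytes::Bytes::from_static(b"second chunk")),
    ];
    let stream = futures::stream::iter(chunks);

    // Adapt the stream to AsyncBufRead and copy it into a buffered destination file.
    let mut reader = tokio_util::io::StreamReader::new(stream);
    let file = tokio::fs::File::create("layer.download.tmp").await?;
    let mut writer = tokio::io::BufWriter::with_capacity(64 * 1024, file);

    let bytes_amount = tokio::io::copy_buf(&mut reader, &mut writer).await?;
    writer.flush().await?; // flush buffered data before handing the file back
    println!("wrote {bytes_amount} bytes");
    Ok(())
}
```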
@@ -169,10 +179,10 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
 /// List timelines of given tenant in remote storage
 pub async fn list_remote_timelines(
 storage: &GenericRemoteStorage,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 cancel: CancellationToken,
 ) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
-let remote_path = remote_timelines_path(&tenant_id);
+let remote_path = remote_timelines_path(&tenant_shard_id);
 
 fail::fail_point!("storage-sync-list-remote-timelines", |_| {
 anyhow::bail!("storage-sync-list-remote-timelines");
@@ -180,7 +190,7 @@ pub async fn list_remote_timelines(
 
 let listing = download_retry_forever(
 || storage.list(Some(&remote_path), ListingMode::WithDelimiter),
-&format!("list timelines for {tenant_id}"),
+&format!("list timelines for {tenant_shard_id}"),
 cancel,
 )
 .await?;
@@ -190,7 +200,7 @@ pub async fn list_remote_timelines(
 
 for timeline_remote_storage_key in listing.prefixes {
 let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
-anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
+anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}")
 })?;
 
 match object_name.parse::<TimelineId>() {
@@ -211,25 +221,27 @@ pub async fn list_remote_timelines(
 
 async fn do_download_index_part(
 storage: &GenericRemoteStorage,
-tenant_id: &TenantId,
+tenant_shard_id: &TenantShardId,
 timeline_id: &TimelineId,
 index_generation: Generation,
 cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
-let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
+use futures::stream::StreamExt;
+
+let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
 
 let index_part_bytes = download_retry_forever(
 || async {
-let mut index_part_download = storage.download(&remote_path).await?;
+let index_part_download = storage.download(&remote_path).await?;
 
 let mut index_part_bytes = Vec::new();
-tokio::io::copy(
-&mut index_part_download.download_stream,
-&mut index_part_bytes,
-)
-.await
-.with_context(|| format!("download index part at {remote_path:?}"))
-.map_err(DownloadError::Other)?;
+let mut stream = std::pin::pin!(index_part_download.download_stream);
+while let Some(chunk) = stream.next().await {
+let chunk = chunk
+.with_context(|| format!("download index part at {remote_path:?}"))
+.map_err(DownloadError::Other)?;
+index_part_bytes.extend_from_slice(&chunk[..]);
+}
 Ok(index_part_bytes)
 },
 &format!("download {remote_path:?}"),
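The rewritten `do_download_index_part` body above collects the download stream chunk by chunk with `StreamExt::next` instead of copying it as an `AsyncRead`. The same loop in isolation, with a fabricated two-chunk stream in place of the real download (that stream is an assumption for the example):

```rust
use futures::stream::StreamExt;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Fabricated download stream with two chunks.
    let mut stream = std::pin::pin!(futures::stream::iter(vec![
        Ok::<_, std::io::Error>(bytes::Bytes::from_static(b"{\"version\":")),
        Ok(bytes::Bytes::from_static(b"1}")),
    ]));

    let mut index_part_bytes = Vec::new();
    while let Some(chunk) = stream.next().await {
        let chunk = chunk?; // each chunk may carry a transport error
        index_part_bytes.extend_from_slice(&chunk[..]);
    }
    assert_eq!(index_part_bytes, b"{\"version\":1}");
    Ok(())
}
```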
@@ -252,7 +264,7 @@ async fn do_download_index_part(
 #[tracing::instrument(skip_all, fields(generation=?my_generation))]
 pub(super) async fn download_index_part(
 storage: &GenericRemoteStorage,
-tenant_id: &TenantId,
+tenant_shard_id: &TenantShardId,
 timeline_id: &TimelineId,
 my_generation: Generation,
 cancel: CancellationToken,
@@ -261,8 +273,14 @@ pub(super) async fn download_index_part(
 
 if my_generation.is_none() {
 // Operating without generations: just fetch the generation-less path
-return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
-.await;
+return do_download_index_part(
+storage,
+tenant_shard_id,
+timeline_id,
+my_generation,
+cancel,
+)
+.await;
 }
 
 // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
@@ -271,7 +289,7 @@ pub(super) async fn download_index_part(
 // This is an optimization to avoid doing the listing for the general case below.
 let res = do_download_index_part(
 storage,
-tenant_id,
+tenant_shard_id,
 timeline_id,
 my_generation,
 cancel.clone(),
@@ -298,7 +316,7 @@ pub(super) async fn download_index_part(
 // This is an optimization to avoid doing the listing for the general case below.
 let res = do_download_index_part(
 storage,
-tenant_id,
+tenant_shard_id,
 timeline_id,
 my_generation.previous(),
 cancel.clone(),
@@ -320,8 +338,9 @@ pub(super) async fn download_index_part(
 }
 
 // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
-// objects, and select the highest one with a generation <= my_generation.
-let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
+// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
+// to constructing a full index path with no generation, because the generation is a suffix.
+let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
 let indices = backoff::retry(
 || async { storage.list_files(Some(&index_prefix)).await },
 |_| false,
@@ -347,18 +366,93 @@ pub(super) async fn download_index_part(
 match max_previous_generation {
 Some(g) => {
 tracing::debug!("Found index_part in generation {g:?}");
-do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
+do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await
 }
 None => {
 // Migration from legacy pre-generation state: we have a generation but no prior
 // attached pageservers did. Try to load from a no-generation path.
-tracing::info!("No index_part.json* found");
-do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
-.await
+tracing::debug!("No index_part.json* found");
+do_download_index_part(
+storage,
+tenant_shard_id,
+timeline_id,
+Generation::none(),
+cancel,
+)
+.await
 }
 }
 }
+
+pub(crate) async fn download_initdb_tar_zst(
+conf: &'static PageServerConf,
+storage: &GenericRemoteStorage,
+tenant_shard_id: &TenantShardId,
+timeline_id: &TimelineId,
+) -> Result<(Utf8PathBuf, File), DownloadError> {
+debug_assert_current_span_has_tenant_and_timeline_id();
+
+let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
+
+let timeline_path = conf.timelines_path(tenant_shard_id);
+
+if !timeline_path.exists() {
+tokio::fs::create_dir_all(&timeline_path)
+.await
+.with_context(|| format!("timeline dir creation {timeline_path}"))
+.map_err(DownloadError::Other)?;
+}
+let temp_path = timeline_path.join(format!(
+"{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
+));
+
+let file = download_retry(
+|| async {
+let file = OpenOptions::new()
+.create(true)
+.truncate(true)
+.read(true)
+.write(true)
+.open(&temp_path)
+.await
+.with_context(|| format!("tempfile creation {temp_path}"))
+.map_err(DownloadError::Other)?;
+
+let download = storage.download(&remote_path).await?;
+let mut download = tokio_util::io::StreamReader::new(download.download_stream);
+let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
+
+tokio::io::copy_buf(&mut download, &mut writer)
+.await
+.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
+.map_err(DownloadError::Other)?;
+
+let mut file = writer.into_inner();
+
+file.seek(std::io::SeekFrom::Start(0))
+.await
+.with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
+.map_err(DownloadError::Other)?;
+
+Ok(file)
+},
+&format!("download {remote_path}"),
+)
+.await
+.map_err(|e| {
+// Do a best-effort attempt at deleting the temporary file upon encountering an error.
+// We don't have async here nor do we want to pile on any extra errors.
+if let Err(e) = std::fs::remove_file(&temp_path) {
+if e.kind() != std::io::ErrorKind::NotFound {
+warn!("error deleting temporary file {temp_path}: {e}");
+}
+}
+e
+})?;
+
+Ok((temp_path, file))
+}
 
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
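`download_initdb_tar_zst` above streams the archive through a `BufWriter` into a temp file and then rewinds the underlying `File` so the caller can read it from the start. A stripped-down sketch of that write-flush-rewind sequence; the file name and payload are placeholders, not the real archive handling:

```rust
use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let file = tokio::fs::File::create("initdb.tar.zst.download.tmp").await?;
    let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);

    writer.write_all(b"archive bytes would be streamed here").await?;
    writer.flush().await?; // into_inner does not flush, so flush explicitly first

    // Recover the File and rewind it so the caller can read from the top.
    let mut file = writer.into_inner();
    file.seek(std::io::SeekFrom::Start(0)).await?;

    let mut check = String::new();
    file.read_to_string(&mut check).await?;
    assert!(check.starts_with("archive bytes"));
    Ok(())
}
```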
@@ -12,6 +12,7 @@ use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::upload_queue::UploadQueueInitialized;
 use crate::tenant::Generation;
+use pageserver_api::shard::ShardIndex;
 
 use utils::lsn::Lsn;
 
@@ -25,6 +26,8 @@ pub struct LayerFileMetadata {
 file_size: u64,
 
 pub(crate) generation: Generation,
+
+pub(crate) shard: ShardIndex,
 }
 
 impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
@@ -32,15 +35,17 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
 LayerFileMetadata {
 file_size: other.file_size,
 generation: other.generation,
+shard: other.shard,
 }
 }
 }
 
 impl LayerFileMetadata {
-pub fn new(file_size: u64, generation: Generation) -> Self {
+pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
 LayerFileMetadata {
 file_size,
 generation,
+shard,
 }
 }
 
@@ -128,6 +133,14 @@ impl IndexPart {
 pub fn get_disk_consistent_lsn(&self) -> Lsn {
 self.disk_consistent_lsn
 }
+
+pub fn from_s3_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+serde_json::from_slice::<IndexPart>(bytes)
+}
+
+pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
+serde_json::to_vec(self)
+}
 }
 
 impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
|||||||
#[serde(default = "Generation::none")]
|
#[serde(default = "Generation::none")]
|
||||||
#[serde(skip_serializing_if = "Generation::is_none")]
|
#[serde(skip_serializing_if = "Generation::is_none")]
|
||||||
pub generation: Generation,
|
pub generation: Generation,
|
||||||
|
|
||||||
|
#[serde(default = "ShardIndex::unsharded")]
|
||||||
|
#[serde(skip_serializing_if = "ShardIndex::is_unsharded")]
|
||||||
|
pub shard: ShardIndex,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<LayerFileMetadata> for IndexLayerMetadata {
|
impl From<LayerFileMetadata> for IndexLayerMetadata {
|
||||||
@@ -160,6 +177,7 @@ impl From<LayerFileMetadata> for IndexLayerMetadata {
|
|||||||
IndexLayerMetadata {
|
IndexLayerMetadata {
|
||||||
file_size: other.file_size,
|
file_size: other.file_size,
|
||||||
generation: other.generation,
|
generation: other.generation,
|
||||||
|
shard: other.shard,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -187,13 +205,15 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
@@ -201,7 +221,7 @@ mod tests {
|
|||||||
deleted_at: None,
|
deleted_at: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||||
assert_eq!(part, expected);
|
assert_eq!(part, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -225,13 +245,15 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
@@ -239,7 +261,7 @@ mod tests {
|
|||||||
deleted_at: None,
|
deleted_at: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||||
assert_eq!(part, expected);
|
assert_eq!(part, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -264,13 +286,15 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
@@ -279,7 +303,7 @@ mod tests {
|
|||||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
||||||
};
|
};
|
||||||
|
|
||||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||||
assert_eq!(part, expected);
|
assert_eq!(part, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -323,7 +347,7 @@ mod tests {
|
|||||||
deleted_at: None,
|
deleted_at: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
|
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
|
||||||
|
|
||||||
assert_eq!(empty_layers_parsed, expected);
|
assert_eq!(empty_layers_parsed, expected);
|
||||||
}
|
}
|
||||||
@@ -346,22 +370,24 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
 deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
+"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
 };
 
-let part = serde_json::from_str::<IndexPart>(example).unwrap();
+let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
 assert_eq!(part, expected);
 }
 }
@@ -3,13 +3,16 @@
 use anyhow::{bail, Context};
 use camino::Utf8Path;
 use fail::fail_point;
+use pageserver_api::shard::TenantShardId;
 use std::io::ErrorKind;
-use tokio::fs;
+use tokio::fs::{self, File};
 
 use super::Generation;
 use crate::{
 config::PageServerConf,
-tenant::remote_timeline_client::{index::IndexPart, remote_index_path, remote_path},
+tenant::remote_timeline_client::{
+index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
+},
 };
 use remote_storage::GenericRemoteStorage;
 use utils::id::{TenantId, TimelineId};
@@ -21,7 +24,7 @@ use tracing::info;
 /// Serializes and uploads the given index part data to the remote storage.
 pub(super) async fn upload_index_part<'a>(
 storage: &'a GenericRemoteStorage,
-tenant_id: &TenantId,
+tenant_shard_id: &TenantShardId,
 timeline_id: &TimelineId,
 generation: Generation,
 index_part: &'a IndexPart,
@@ -33,16 +36,21 @@ pub(super) async fn upload_index_part<'a>(
 });
 pausable_failpoint!("before-upload-index-pausable");
 
-let index_part_bytes =
-serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
+let index_part_bytes = index_part
+.to_s3_bytes()
+.context("serialize index part file into bytes")?;
 let index_part_size = index_part_bytes.len();
-let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
+let index_part_bytes = bytes::Bytes::from(index_part_bytes);
 
-let remote_path = remote_index_path(tenant_id, timeline_id, generation);
+let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
 storage
-.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
+.upload_storage_object(
+futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
+index_part_size,
+&remote_path,
+)
 .await
-.with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'"))
+.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
 
 /// Attempts to upload given layer files.
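The index upload above now hands `upload_storage_object` a single-chunk stream built from `bytes::Bytes` instead of a boxed reader. Constructing and draining such a one-item stream looks like this; the error type and the final collection step are generic stand-ins, not the storage API's exact signature:

```rust
use futures::stream::StreamExt;

#[tokio::main]
async fn main() {
    let serialized = br#"{"version":4}"#.to_vec();
    let body = bytes::Bytes::from(serialized);
    let len = body.len();

    // One-chunk stream, like the argument passed to upload_storage_object above.
    let stream = futures::stream::once(futures::future::ready(Ok::<_, std::io::Error>(body)));

    // A consumer sees exactly one chunk whose length matches the advertised size.
    let chunks: Vec<_> = stream.collect().await;
    assert_eq!(chunks.len(), 1);
    assert_eq!(chunks[0].as_ref().unwrap().len(), len);
}
```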
@@ -96,10 +104,31 @@ pub(super) async fn upload_timeline_layer<'a>(
 let fs_size = usize::try_from(fs_size)
 .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
 
+let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
+
 storage
-.upload(source_file, fs_size, &storage_path, None)
+.upload(reader, fs_size, &storage_path, None)
 .await
 .with_context(|| format!("upload layer from local path '{source_path}'"))?;
 
 Ok(())
 }
+
+/// Uploads the given `initdb` data to the remote storage.
+pub(crate) async fn upload_initdb_dir(
+storage: &GenericRemoteStorage,
+tenant_id: &TenantId,
+timeline_id: &TimelineId,
+initdb_tar_zst: File,
+size: u64,
+) -> anyhow::Result<()> {
+tracing::trace!("uploading initdb dir");
+
+let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
+
+let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
+storage
+.upload_storage_object(file, size as usize, &remote_path)
+.await
+.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
+}
|||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use tokio::sync::oneshot::error::RecvError;
|
use tokio::sync::oneshot::error::RecvError;
|
||||||
use tokio::sync::Semaphore;
|
use tokio::sync::Semaphore;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||||
@@ -113,11 +114,12 @@ pub(super) async fn gather_inputs(
|
|||||||
max_retention_period: Option<u64>,
|
max_retention_period: Option<u64>,
|
||||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||||
cause: LogicalSizeCalculationCause,
|
cause: LogicalSizeCalculationCause,
|
||||||
|
cancel: &CancellationToken,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<ModelInputs> {
|
) -> anyhow::Result<ModelInputs> {
|
||||||
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
||||||
tenant
|
tenant
|
||||||
.refresh_gc_info(ctx)
|
.refresh_gc_info(cancel, ctx)
|
||||||
.await
|
.await
|
||||||
.context("Failed to refresh gc_info before gathering inputs")?;
|
.context("Failed to refresh gc_info before gathering inputs")?;
|
||||||
|
|
||||||
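`gather_inputs` above now takes a `&CancellationToken` and forwards it to `refresh_gc_info`, so long-running size calculations can be interrupted. The general pattern of handing a child token to async work and racing it against cancellation, in isolation (the surrounding task and timing are invented for the example):

```rust
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let child = cancel.child_token(); // hand this to the long-running helper

    let work = tokio::spawn(async move {
        tokio::select! {
            _ = child.cancelled() => "cancelled",
            _ = tokio::time::sleep(std::time::Duration::from_secs(30)) => "finished",
        }
    });

    cancel.cancel(); // e.g. pageserver shutdown
    assert_eq!(work.await.unwrap(), "cancelled");
}
```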
@@ -2,9 +2,9 @@
 
 pub mod delta_layer;
 mod filename;
-mod image_layer;
+pub mod image_layer;
 mod inmemory_layer;
-mod layer;
+pub(crate) mod layer;
 mod layer_desc;
 
 use crate::context::{AccessStatsBehavior, RequestContext};
@@ -24,10 +24,7 @@ use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
 use utils::rate_limit::RateLimit;
 
-use utils::{
-id::{TenantId, TimelineId},
-lsn::Lsn,
-};
+use utils::{id::TimelineId, lsn::Lsn};
 
 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
@@ -304,12 +301,14 @@ pub trait AsLayerDesc {
 }
 
 pub mod tests {
+use pageserver_api::shard::TenantShardId;
+
 use super::*;
 
 impl From<DeltaFileName> for PersistentLayerDesc {
 fn from(value: DeltaFileName) -> Self {
 PersistentLayerDesc::new_delta(
-TenantId::from_array([0; 16]),
+TenantShardId::from([0; 18]),
 TimelineId::from_array([0; 16]),
 value.key_range,
 value.lsn_range,
@@ -321,7 +320,7 @@ pub mod tests {
 impl From<ImageFileName> for PersistentLayerDesc {
 fn from(value: ImageFileName) -> Self {
 PersistentLayerDesc::new_img(
-TenantId::from_array([0; 16]),
+TenantShardId::from([0; 18]),
 TimelineId::from_array([0; 16]),
 value.key_range,
 value.lsn,
@@ -42,6 +42,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::LayerAccessKind;
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::File;
@@ -69,13 +70,13 @@ use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub struct Summary {
 /// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC.
-magic: u16,
-format_version: u16,
+pub magic: u16,
+pub format_version: u16,
 
-tenant_id: TenantId,
-timeline_id: TimelineId,
-key_range: Range<Key>,
-lsn_range: Range<Lsn>,
+pub tenant_id: TenantId,
+pub timeline_id: TimelineId,
+pub key_range: Range<Key>,
+pub lsn_range: Range<Lsn>,
 
 /// Block number where the 'index' part of the file begins.
 pub index_start_blk: u32,
@@ -86,7 +87,7 @@ pub struct Summary {
 impl From<&DeltaLayer> for Summary {
 fn from(layer: &DeltaLayer) -> Self {
 Self::expected(
-layer.desc.tenant_id,
+layer.desc.tenant_shard_id.tenant_id,
 layer.desc.timeline_id,
 layer.desc.key_range.clone(),
 layer.desc.lsn_range.clone(),
@@ -248,7 +249,7 @@ impl DeltaLayer {
 
 fn temp_path_for(
 conf: &PageServerConf,
-tenant_id: &TenantId,
+tenant_shard_id: &TenantShardId,
 timeline_id: &TimelineId,
 key_start: Key,
 lsn_range: &Range<Lsn>,
@@ -259,14 +260,15 @@ impl DeltaLayer {
 .map(char::from)
 .collect();
 
-conf.timeline_path(tenant_id, timeline_id).join(format!(
-"{}-XXX__{:016X}-{:016X}.{}.{}",
-key_start,
-u64::from(lsn_range.start),
-u64::from(lsn_range.end),
-rand_string,
-TEMP_FILE_SUFFIX,
-))
+conf.timeline_path(tenant_shard_id, timeline_id)
+.join(format!(
+"{}-XXX__{:016X}-{:016X}.{}.{}",
+key_start,
+u64::from(lsn_range.start),
+u64::from(lsn_range.end),
+rand_string,
+TEMP_FILE_SUFFIX,
+))
 }
 
 ///
@@ -289,7 +291,9 @@ impl DeltaLayer {
 async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
 let path = self.path();
 
-let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
+let loaded = DeltaLayerInner::load(&path, None, ctx)
+.await
+.and_then(|res| res)?;
 
 // not production code
 let actual_filename = path.file_name().unwrap().to_owned();
@@ -316,10 +320,14 @@ impl DeltaLayer {
 .metadata()
 .context("get file metadata to determine size")?;
 
+// TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary.
+// we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn.
+let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
+
 Ok(DeltaLayer {
 path: path.to_path_buf(),
 desc: PersistentLayerDesc::new_delta(
-summary.tenant_id,
+tenant_shard_id,
 summary.timeline_id,
 summary.key_range,
 summary.lsn_range,
@@ -351,7 +359,7 @@ struct DeltaLayerWriterInner {
 conf: &'static PageServerConf,
 pub path: Utf8PathBuf,
 timeline_id: TimelineId,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 
 key_start: Key,
 lsn_range: Range<Lsn>,
@@ -368,7 +376,7 @@ impl DeltaLayerWriterInner {
 async fn new(
 conf: &'static PageServerConf,
 timeline_id: TimelineId,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 key_start: Key,
 lsn_range: Range<Lsn>,
 ) -> anyhow::Result<Self> {
@@ -378,7 +386,8 @@ impl DeltaLayerWriterInner {
 //
 // Note: This overwrites any existing file. There shouldn't be any.
 // FIXME: throw an error instead?
-let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
+let path =
+DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);
 
 let mut file = VirtualFile::create(&path).await?;
 // make room for the header block
@@ -393,7 +402,7 @@ impl DeltaLayerWriterInner {
 conf,
 path,
 timeline_id,
-tenant_id,
+tenant_shard_id,
 key_start,
 lsn_range,
 tree: tree_builder,
@@ -455,7 +464,7 @@ impl DeltaLayerWriterInner {
 let summary = Summary {
 magic: DELTA_FILE_MAGIC,
 format_version: STORAGE_FORMAT_VERSION,
-tenant_id: self.tenant_id,
+tenant_id: self.tenant_shard_id.tenant_id,
 timeline_id: self.timeline_id,
 key_range: self.key_start..key_end,
 lsn_range: self.lsn_range.clone(),
@@ -496,7 +505,7 @@ impl DeltaLayerWriterInner {
 // set inner.file here. The first read will have to re-open it.
 
 let desc = PersistentLayerDesc::new_delta(
-self.tenant_id,
+self.tenant_shard_id,
 self.timeline_id,
 self.key_start..key_end,
 self.lsn_range.clone(),
@@ -547,14 +556,20 @@ impl DeltaLayerWriter {
 pub async fn new(
 conf: &'static PageServerConf,
 timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
key_start: Key,
|
key_start: Key,
|
||||||
lsn_range: Range<Lsn>,
|
lsn_range: Range<Lsn>,
|
||||||
) -> anyhow::Result<Self> {
|
) -> anyhow::Result<Self> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: Some(
|
inner: Some(
|
||||||
DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)
|
DeltaLayerWriterInner::new(
|
||||||
.await?,
|
conf,
|
||||||
|
timeline_id,
|
||||||
|
tenant_shard_id,
|
||||||
|
key_start,
|
||||||
|
lsn_range,
|
||||||
|
)
|
||||||
|
.await?,
|
||||||
),
|
),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -609,19 +624,84 @@ impl Drop for DeltaLayerWriter {
     }
 }

+#[derive(thiserror::Error, Debug)]
+pub enum RewriteSummaryError {
+    #[error("magic mismatch")]
+    MagicMismatch,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<std::io::Error> for RewriteSummaryError {
+    fn from(e: std::io::Error) -> Self {
+        Self::Other(anyhow::anyhow!(e))
+    }
+}
+
+impl DeltaLayer {
+    pub async fn rewrite_summary<F>(
+        path: &Utf8Path,
+        rewrite: F,
+        ctx: &RequestContext,
+    ) -> Result<(), RewriteSummaryError>
+    where
+        F: Fn(Summary) -> Summary,
+    {
+        let file = VirtualFile::open_with_options(
+            path,
+            &*std::fs::OpenOptions::new().read(true).write(true),
+        )
+        .await
+        .with_context(|| format!("Failed to open file '{}'", path))?;
+        let file = FileBlockReader::new(file);
+        let summary_blk = file.read_blk(0, ctx).await?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
+        let mut file = file.file;
+        if actual_summary.magic != DELTA_FILE_MAGIC {
+            return Err(RewriteSummaryError::MagicMismatch);
+        }
+
+        let new_summary = rewrite(actual_summary);
+
+        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
+        if buf.spilled() {
+            // The code in DeltaLayerWriterInner just warn!()s for this.
+            // It should probably error out as well.
+            return Err(RewriteSummaryError::Other(anyhow::anyhow!(
+                "Used more than one page size for summary buffer: {}",
+                buf.len()
+            )));
+        }
+        file.seek(SeekFrom::Start(0)).await?;
+        file.write_all(&buf).await?;
+        Ok(())
+    }
+}
+
 impl DeltaLayerInner {
+    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
+    /// - inner has the success or transient failure
+    /// - outer has the permanent failure
     pub(super) async fn load(
         path: &Utf8Path,
         summary: Option<Summary>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path)
-            .await
-            .with_context(|| format!("Failed to open file '{path}'"))?;
+    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
+        let file = match VirtualFile::open(path).await {
+            Ok(file) => file,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
+        };
         let file = FileBlockReader::new(file);

-        let summary_blk = file.read_blk(0, ctx).await?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let summary_blk = match file.read_blk(0, ctx).await {
+            Ok(blk) => blk,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
+        };
+
+        // TODO: this should be an assertion instead; see ImageLayerInner::load
+        let actual_summary =
+            Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
+
         if let Some(mut expected_summary) = summary {
             // production code path
@@ -636,11 +716,11 @@ impl DeltaLayerInner {
             }
         }

-        Ok(DeltaLayerInner {
+        Ok(Ok(DeltaLayerInner {
             file,
             index_start_blk: actual_summary.index_start_blk,
             index_root_blk: actual_summary.index_root_blk,
-        })
+        }))
     }

     pub(super) async fn get_value_reconstruct_data(
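With the `Summary` fields now `pub` and the `rewrite_summary` helper above, a caller can patch the header block of an on-disk delta layer in place. The following is a minimal sketch, not code from this change: the function name and the way the new tenant id is obtained are made up, and the parameter types are assumed to be the ones the module already imports.

```rust
// Sketch only: rewrites the tenant_id stored in a delta layer's summary block.
// `retarget_layer_tenant` is hypothetical; rewrite_summary, Summary and its
// now-public fields come from the change above.
async fn retarget_layer_tenant(
    path: &Utf8Path,
    new_tenant_id: TenantId,
    ctx: &RequestContext,
) -> Result<(), RewriteSummaryError> {
    DeltaLayer::rewrite_summary(
        path,
        // struct update syntax works because the fields are now pub
        |summary| Summary {
            tenant_id: new_tenant_id,
            ..summary
        },
        ctx,
    )
    .await
}
```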
|
|||||||
@@ -41,6 +41,7 @@ use bytes::Bytes;
|
|||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use hex;
|
use hex;
|
||||||
use pageserver_api::models::LayerAccessKind;
|
use pageserver_api::models::LayerAccessKind;
|
||||||
|
use pageserver_api::shard::TenantShardId;
|
||||||
use rand::{distributions::Alphanumeric, Rng};
|
use rand::{distributions::Alphanumeric, Rng};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
@@ -67,27 +68,27 @@ use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
|
|||||||
/// the 'index' starts at the block indicated by 'index_start_blk'
|
/// the 'index' starts at the block indicated by 'index_start_blk'
|
||||||
///
|
///
|
||||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||||
pub(super) struct Summary {
|
pub struct Summary {
|
||||||
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
|
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
|
||||||
magic: u16,
|
pub magic: u16,
|
||||||
format_version: u16,
|
pub format_version: u16,
|
||||||
|
|
||||||
tenant_id: TenantId,
|
pub tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
pub timeline_id: TimelineId,
|
||||||
key_range: Range<Key>,
|
pub key_range: Range<Key>,
|
||||||
lsn: Lsn,
|
pub lsn: Lsn,
|
||||||
|
|
||||||
/// Block number where the 'index' part of the file begins.
|
/// Block number where the 'index' part of the file begins.
|
||||||
index_start_blk: u32,
|
pub index_start_blk: u32,
|
||||||
/// Block within the 'index', where the B-tree root page is stored
|
/// Block within the 'index', where the B-tree root page is stored
|
||||||
index_root_blk: u32,
|
pub index_root_blk: u32,
|
||||||
// the 'values' part starts after the summary header, on block 1.
|
// the 'values' part starts after the summary header, on block 1.
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<&ImageLayer> for Summary {
|
impl From<&ImageLayer> for Summary {
|
||||||
fn from(layer: &ImageLayer) -> Self {
|
fn from(layer: &ImageLayer) -> Self {
|
||||||
Self::expected(
|
Self::expected(
|
||||||
layer.desc.tenant_id,
|
layer.desc.tenant_shard_id.tenant_id,
|
||||||
layer.desc.timeline_id,
|
layer.desc.timeline_id,
|
||||||
layer.desc.key_range.clone(),
|
layer.desc.key_range.clone(),
|
||||||
layer.lsn,
|
layer.lsn,
|
||||||
@@ -217,7 +218,7 @@ impl ImageLayer {
|
|||||||
fn temp_path_for(
|
fn temp_path_for(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
fname: &ImageFileName,
|
fname: &ImageFileName,
|
||||||
) -> Utf8PathBuf {
|
) -> Utf8PathBuf {
|
||||||
let rand_string: String = rand::thread_rng()
|
let rand_string: String = rand::thread_rng()
|
||||||
@@ -226,7 +227,7 @@ impl ImageLayer {
|
|||||||
.map(char::from)
|
.map(char::from)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
conf.timeline_path(&tenant_id, &timeline_id)
|
conf.timeline_path(&tenant_shard_id, &timeline_id)
|
||||||
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
|
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -249,7 +250,9 @@ impl ImageLayer {
|
|||||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||||
let path = self.path();
|
let path = self.path();
|
||||||
|
|
||||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
|
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
|
||||||
|
.await
|
||||||
|
.and_then(|res| res)?;
|
||||||
|
|
||||||
// not production code
|
// not production code
|
||||||
let actual_filename = path.file_name().unwrap().to_owned();
|
let actual_filename = path.file_name().unwrap().to_owned();
|
||||||
@@ -274,10 +277,15 @@ impl ImageLayer {
|
|||||||
let metadata = file
|
let metadata = file
|
||||||
.metadata()
|
.metadata()
|
||||||
.context("get file metadata to determine size")?;
|
.context("get file metadata to determine size")?;
|
||||||
|
|
||||||
|
// TODO(sharding): we should get TenantShardId from path.
|
||||||
|
// OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart.
|
||||||
|
let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
|
||||||
|
|
||||||
Ok(ImageLayer {
|
Ok(ImageLayer {
|
||||||
path: path.to_path_buf(),
|
path: path.to_path_buf(),
|
||||||
desc: PersistentLayerDesc::new_img(
|
desc: PersistentLayerDesc::new_img(
|
||||||
summary.tenant_id,
|
tenant_shard_id,
|
||||||
summary.timeline_id,
|
summary.timeline_id,
|
||||||
summary.key_range,
|
summary.key_range,
|
||||||
summary.lsn,
|
summary.lsn,
|
||||||
@@ -294,19 +302,87 @@ impl ImageLayer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(thiserror::Error, Debug)]
|
||||||
|
pub enum RewriteSummaryError {
|
||||||
|
#[error("magic mismatch")]
|
||||||
|
MagicMismatch,
|
||||||
|
#[error(transparent)]
|
||||||
|
Other(#[from] anyhow::Error),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<std::io::Error> for RewriteSummaryError {
|
||||||
|
fn from(e: std::io::Error) -> Self {
|
||||||
|
Self::Other(anyhow::anyhow!(e))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ImageLayer {
|
||||||
|
pub async fn rewrite_summary<F>(
|
||||||
|
path: &Utf8Path,
|
||||||
|
rewrite: F,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
) -> Result<(), RewriteSummaryError>
|
||||||
|
where
|
||||||
|
F: Fn(Summary) -> Summary,
|
||||||
|
{
|
||||||
|
let file = VirtualFile::open_with_options(
|
||||||
|
path,
|
||||||
|
&*std::fs::OpenOptions::new().read(true).write(true),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||||
|
let file = FileBlockReader::new(file);
|
||||||
|
let summary_blk = file.read_blk(0, ctx).await?;
|
||||||
|
let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
|
||||||
|
let mut file = file.file;
|
||||||
|
if actual_summary.magic != IMAGE_FILE_MAGIC {
|
||||||
|
return Err(RewriteSummaryError::MagicMismatch);
|
||||||
|
}
|
||||||
|
|
||||||
|
let new_summary = rewrite(actual_summary);
|
||||||
|
|
||||||
|
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||||
|
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||||
|
if buf.spilled() {
|
||||||
|
// The code in ImageLayerWriterInner just warn!()s for this.
|
||||||
|
// It should probably error out as well.
|
||||||
|
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
||||||
|
"Used more than one page size for summary buffer: {}",
|
||||||
|
buf.len()
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
file.seek(SeekFrom::Start(0)).await?;
|
||||||
|
file.write_all(&buf).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl ImageLayerInner {
|
impl ImageLayerInner {
|
||||||
|
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
|
||||||
|
/// - inner has the success or transient failure
|
||||||
|
/// - outer has the permanent failure
|
||||||
pub(super) async fn load(
|
pub(super) async fn load(
|
||||||
path: &Utf8Path,
|
path: &Utf8Path,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
summary: Option<Summary>,
|
summary: Option<Summary>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<Self> {
|
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||||
let file = VirtualFile::open(path)
|
let file = match VirtualFile::open(path).await {
|
||||||
.await
|
Ok(file) => file,
|
||||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||||
|
};
|
||||||
let file = FileBlockReader::new(file);
|
let file = FileBlockReader::new(file);
|
||||||
let summary_blk = file.read_blk(0, ctx).await?;
|
let summary_blk = match file.read_blk(0, ctx).await {
|
||||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
Ok(blk) => blk,
|
||||||
|
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||||
|
};
|
||||||
|
|
||||||
|
// length is the only way how this could fail, so it's not actually likely at all unless
|
||||||
|
// read_blk returns wrong sized block.
|
||||||
|
//
|
||||||
|
// TODO: confirm and make this into assertion
|
||||||
|
let actual_summary =
|
||||||
|
Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
|
||||||
|
|
||||||
if let Some(mut expected_summary) = summary {
|
if let Some(mut expected_summary) = summary {
|
||||||
// production code path
|
// production code path
|
||||||
@@ -322,12 +398,12 @@ impl ImageLayerInner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(ImageLayerInner {
|
Ok(Ok(ImageLayerInner {
|
||||||
index_start_blk: actual_summary.index_start_blk,
|
index_start_blk: actual_summary.index_start_blk,
|
||||||
index_root_blk: actual_summary.index_root_blk,
|
index_root_blk: actual_summary.index_root_blk,
|
||||||
lsn,
|
lsn,
|
||||||
file,
|
file,
|
||||||
})
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(super) async fn get_value_reconstruct_data(
|
pub(super) async fn get_value_reconstruct_data(
|
||||||
@@ -385,7 +461,7 @@ struct ImageLayerWriterInner {
|
|||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
path: Utf8PathBuf,
|
path: Utf8PathBuf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
key_range: Range<Key>,
|
key_range: Range<Key>,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
|
|
||||||
@@ -400,7 +476,7 @@ impl ImageLayerWriterInner {
|
|||||||
async fn new(
|
async fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
key_range: &Range<Key>,
|
key_range: &Range<Key>,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
) -> anyhow::Result<Self> {
|
) -> anyhow::Result<Self> {
|
||||||
@@ -409,7 +485,7 @@ impl ImageLayerWriterInner {
|
|||||||
let path = ImageLayer::temp_path_for(
|
let path = ImageLayer::temp_path_for(
|
||||||
conf,
|
conf,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
&ImageFileName {
|
&ImageFileName {
|
||||||
key_range: key_range.clone(),
|
key_range: key_range.clone(),
|
||||||
lsn,
|
lsn,
|
||||||
@@ -433,7 +509,7 @@ impl ImageLayerWriterInner {
|
|||||||
conf,
|
conf,
|
||||||
path,
|
path,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
key_range: key_range.clone(),
|
key_range: key_range.clone(),
|
||||||
lsn,
|
lsn,
|
||||||
tree: tree_builder,
|
tree: tree_builder,
|
||||||
@@ -480,7 +556,7 @@ impl ImageLayerWriterInner {
|
|||||||
let summary = Summary {
|
let summary = Summary {
|
||||||
magic: IMAGE_FILE_MAGIC,
|
magic: IMAGE_FILE_MAGIC,
|
||||||
format_version: STORAGE_FORMAT_VERSION,
|
format_version: STORAGE_FORMAT_VERSION,
|
||||||
tenant_id: self.tenant_id,
|
tenant_id: self.tenant_shard_id.tenant_id,
|
||||||
timeline_id: self.timeline_id,
|
timeline_id: self.timeline_id,
|
||||||
key_range: self.key_range.clone(),
|
key_range: self.key_range.clone(),
|
||||||
lsn: self.lsn,
|
lsn: self.lsn,
|
||||||
@@ -506,7 +582,7 @@ impl ImageLayerWriterInner {
|
|||||||
.context("get metadata to determine file size")?;
|
.context("get metadata to determine file size")?;
|
||||||
|
|
||||||
let desc = PersistentLayerDesc::new_img(
|
let desc = PersistentLayerDesc::new_img(
|
||||||
self.tenant_id,
|
self.tenant_shard_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.key_range.clone(),
|
self.key_range.clone(),
|
||||||
self.lsn,
|
self.lsn,
|
||||||
@@ -562,13 +638,14 @@ impl ImageLayerWriter {
|
|||||||
pub async fn new(
|
pub async fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
key_range: &Range<Key>,
|
key_range: &Range<Key>,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
) -> anyhow::Result<ImageLayerWriter> {
|
) -> anyhow::Result<ImageLayerWriter> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: Some(
|
inner: Some(
|
||||||
ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
|
ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
|
||||||
|
.await?,
|
||||||
),
|
),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,15 +14,11 @@ use crate::tenant::Timeline;
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
+use pageserver_api::shard::TenantShardId;
 use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};
 use tracing::*;
-use utils::{
-    bin_ser::BeSer,
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-    vec_map::VecMap,
-};
+use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
@@ -33,7 +29,7 @@ use super::{DeltaLayerWriter, ResidentLayer};
|
|||||||
|
|
||||||
pub struct InMemoryLayer {
|
pub struct InMemoryLayer {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
|
|
||||||
/// This layer contains all the changes from 'start_lsn'. The
|
/// This layer contains all the changes from 'start_lsn'. The
|
||||||
@@ -226,17 +222,17 @@ impl InMemoryLayer {
|
|||||||
pub async fn create(
|
pub async fn create(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
start_lsn: Lsn,
|
start_lsn: Lsn,
|
||||||
) -> Result<InMemoryLayer> {
|
) -> Result<InMemoryLayer> {
|
||||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||||
|
|
||||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
|
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
|
||||||
|
|
||||||
Ok(InMemoryLayer {
|
Ok(InMemoryLayer {
|
||||||
conf,
|
conf,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
start_lsn,
|
start_lsn,
|
||||||
end_lsn: OnceLock::new(),
|
end_lsn: OnceLock::new(),
|
||||||
inner: RwLock::new(InMemoryLayerInner {
|
inner: RwLock::new(InMemoryLayerInner {
|
||||||
@@ -335,7 +331,7 @@ impl InMemoryLayer {
|
|||||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||||
self.conf,
|
self.conf,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.tenant_id,
|
self.tenant_shard_id,
|
||||||
Key::MIN,
|
Key::MIN,
|
||||||
self.start_lsn..end_lsn,
|
self.start_lsn..end_lsn,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ use camino::{Utf8Path, Utf8PathBuf};
|
|||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
||||||
};
|
};
|
||||||
|
use pageserver_api::shard::ShardIndex;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||||
use std::sync::{Arc, Weak};
|
use std::sync::{Arc, Weak};
|
||||||
@@ -81,7 +82,7 @@ impl Layer {
|
|||||||
metadata: LayerFileMetadata,
|
metadata: LayerFileMetadata,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let desc = PersistentLayerDesc::from_filename(
|
let desc = PersistentLayerDesc::from_filename(
|
||||||
timeline.tenant_id,
|
timeline.tenant_shard_id,
|
||||||
timeline.timeline_id,
|
timeline.timeline_id,
|
||||||
file_name,
|
file_name,
|
||||||
metadata.file_size(),
|
metadata.file_size(),
|
||||||
@@ -96,6 +97,7 @@ impl Layer {
|
|||||||
desc,
|
desc,
|
||||||
None,
|
None,
|
||||||
metadata.generation,
|
metadata.generation,
|
||||||
|
metadata.shard,
|
||||||
)));
|
)));
|
||||||
|
|
||||||
debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
|
debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
|
||||||
@@ -111,7 +113,7 @@ impl Layer {
|
|||||||
metadata: LayerFileMetadata,
|
metadata: LayerFileMetadata,
|
||||||
) -> ResidentLayer {
|
) -> ResidentLayer {
|
||||||
let desc = PersistentLayerDesc::from_filename(
|
let desc = PersistentLayerDesc::from_filename(
|
||||||
timeline.tenant_id,
|
timeline.tenant_shard_id,
|
||||||
timeline.timeline_id,
|
timeline.timeline_id,
|
||||||
file_name,
|
file_name,
|
||||||
metadata.file_size(),
|
metadata.file_size(),
|
||||||
@@ -136,6 +138,7 @@ impl Layer {
|
|||||||
desc,
|
desc,
|
||||||
Some(inner),
|
Some(inner),
|
||||||
metadata.generation,
|
metadata.generation,
|
||||||
|
metadata.shard,
|
||||||
)
|
)
|
||||||
}));
|
}));
|
||||||
|
|
||||||
@@ -179,6 +182,7 @@ impl Layer {
|
|||||||
desc,
|
desc,
|
||||||
Some(inner),
|
Some(inner),
|
||||||
timeline.generation,
|
timeline.generation,
|
||||||
|
timeline.get_shard_index(),
|
||||||
)
|
)
|
||||||
}));
|
}));
|
||||||
|
|
||||||
@@ -218,14 +222,18 @@ impl Layer {
|
|||||||
///
|
///
|
||||||
/// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
|
/// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
|
||||||
/// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
|
/// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
|
||||||
pub(crate) fn garbage_collect_on_drop(&self) {
|
pub(crate) fn delete_on_drop(&self) {
|
||||||
self.0.garbage_collect_on_drop();
|
self.0.delete_on_drop();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return data needed to reconstruct given page at LSN.
|
/// Return data needed to reconstruct given page at LSN.
|
||||||
///
|
///
|
||||||
/// It is up to the caller to collect more data from the previous layer and
|
/// It is up to the caller to collect more data from the previous layer and
|
||||||
/// perform WAL redo, if necessary.
|
/// perform WAL redo, if necessary.
|
||||||
|
///
|
||||||
|
/// # Cancellation-Safety
|
||||||
|
///
|
||||||
|
/// This method is cancellation-safe.
|
||||||
pub(crate) async fn get_value_reconstruct_data(
|
pub(crate) async fn get_value_reconstruct_data(
|
||||||
&self,
|
&self,
|
||||||
key: Key,
|
key: Key,
|
||||||
@@ -322,6 +330,24 @@ impl Layer {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Waits until this layer has been dropped (and if needed, local file deletion and remote
|
||||||
|
/// deletion scheduling has completed).
|
||||||
|
///
|
||||||
|
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
|
||||||
|
/// separatedly.
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
|
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
|
||||||
|
let mut rx = self.0.status.subscribe();
|
||||||
|
|
||||||
|
async move {
|
||||||
|
loop {
|
||||||
|
if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
|
/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
|
||||||
@@ -397,8 +423,8 @@ struct LayerInner {
|
|||||||
/// Initialization and deinitialization are done while holding a permit.
|
/// Initialization and deinitialization are done while holding a permit.
|
||||||
inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
|
inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
|
||||||
|
|
||||||
/// Do we want to garbage collect this when `LayerInner` is dropped
|
/// Do we want to delete locally and remotely this when `LayerInner` is dropped
|
||||||
wanted_garbage_collected: AtomicBool,
|
wanted_deleted: AtomicBool,
|
||||||
|
|
||||||
/// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
|
/// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
|
||||||
/// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
|
/// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
|
||||||
@@ -412,10 +438,6 @@ struct LayerInner {
|
|||||||
version: AtomicUsize,
|
version: AtomicUsize,
|
||||||
|
|
||||||
/// Allow subscribing to when the layer actually gets evicted.
|
/// Allow subscribing to when the layer actually gets evicted.
|
||||||
///
|
|
||||||
/// If in future we need to implement "wait until layer instances are gone and done", carrying
|
|
||||||
/// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
|
|
||||||
/// method for "wait_gc" which will wait to this being closed.
|
|
||||||
status: tokio::sync::broadcast::Sender<Status>,
|
status: tokio::sync::broadcast::Sender<Status>,
|
||||||
|
|
||||||
/// Counter for exponential backoff with the download
|
/// Counter for exponential backoff with the download
|
||||||
@@ -426,6 +448,15 @@ struct LayerInner {
|
|||||||
/// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
|
/// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
|
||||||
/// for created layers from [`Timeline::generation`].
|
/// for created layers from [`Timeline::generation`].
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
|
|
||||||
|
/// The shard of this Layer.
|
||||||
|
///
|
||||||
|
/// For layers created in this process, this will always be the [`ShardIndex`] of the
|
||||||
|
/// current `ShardIdentity`` (TODO: add link once it's introduced).
|
||||||
|
///
|
||||||
|
/// For loaded layers, this may be some other value if the tenant has undergone
|
||||||
|
/// a shard split since the layer was originally written.
|
||||||
|
shard: ShardIndex,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for LayerInner {
|
impl std::fmt::Display for LayerInner {
|
||||||
@@ -448,24 +479,28 @@ enum Status {

 impl Drop for LayerInner {
     fn drop(&mut self) {
-        if !*self.wanted_garbage_collected.get_mut() {
+        if !*self.wanted_deleted.get_mut() {
             // should we try to evict if the last wish was for eviction?
             // feels like there's some hazard of overcrowding near shutdown near by, but we don't
             // run drops during shutdown (yet)
             return;
         }

-        let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_id, timeline_id = %self.layer_desc().timeline_id);
+        let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);

         let path = std::mem::take(&mut self.path);
         let file_name = self.layer_desc().filename();
-        let gen = self.generation;
         let file_size = self.layer_desc().file_size;
         let timeline = self.timeline.clone();
+        let meta = self.metadata();
+        let status = self.status.clone();
+
         crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
             let _g = span.entered();

+            // carry this until we are finished for [`Layer::wait_drop`] support
+            let _status = status;
+
             let removed = match std::fs::remove_file(path) {
                 Ok(()) => true,
                 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
@@ -478,8 +513,8 @@ impl Drop for LayerInner {
                     false
                 }
                 Err(e) => {
-                    tracing::error!("failed to remove garbage collected layer: {e}");
-                    LAYER_IMPL_METRICS.inc_gc_removes_failed();
+                    tracing::error!("failed to remove wanted deleted layer: {e}");
+                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
                     false
                 }
             };
@@ -489,7 +524,7 @@ impl Drop for LayerInner {
                     timeline.metrics.resident_physical_size_sub(file_size);
                 }
                 if let Some(remote_client) = timeline.remote_client.as_ref() {
-                    let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, gen)]);
+                    let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);

                     if let Err(e) = res {
                         // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
@@ -501,15 +536,15 @@ impl Drop for LayerInner {
                         } else {
                             tracing::warn!("scheduling deletion on drop failed: {e:#}");
                         }
-                        LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::DeleteSchedulingFailed);
+                        LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
                     } else {
-                        LAYER_IMPL_METRICS.inc_completed_gcs();
+                        LAYER_IMPL_METRICS.inc_completed_deletes();
                     }
                 }
             } else {
                 // no need to nag that timeline is gone: under normal situation on
                 // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
-                LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::TimelineGone);
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
             }
         });
     }
@@ -523,9 +558,10 @@ impl LayerInner {
|
|||||||
desc: PersistentLayerDesc,
|
desc: PersistentLayerDesc,
|
||||||
downloaded: Option<Arc<DownloadedLayer>>,
|
downloaded: Option<Arc<DownloadedLayer>>,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
|
shard: ShardIndex,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let path = conf
|
let path = conf
|
||||||
.timeline_path(&timeline.tenant_id, &timeline.timeline_id)
|
.timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
|
||||||
.join(desc.filename().to_string());
|
.join(desc.filename().to_string());
|
||||||
|
|
||||||
let (inner, version) = if let Some(inner) = downloaded {
|
let (inner, version) = if let Some(inner) = downloaded {
|
||||||
@@ -543,26 +579,24 @@ impl LayerInner {
|
|||||||
timeline: Arc::downgrade(timeline),
|
timeline: Arc::downgrade(timeline),
|
||||||
have_remote_client: timeline.remote_client.is_some(),
|
have_remote_client: timeline.remote_client.is_some(),
|
||||||
access_stats,
|
access_stats,
|
||||||
wanted_garbage_collected: AtomicBool::new(false),
|
wanted_deleted: AtomicBool::new(false),
|
||||||
wanted_evicted: AtomicBool::new(false),
|
wanted_evicted: AtomicBool::new(false),
|
||||||
inner,
|
inner,
|
||||||
version: AtomicUsize::new(version),
|
version: AtomicUsize::new(version),
|
||||||
status: tokio::sync::broadcast::channel(1).0,
|
status: tokio::sync::broadcast::channel(1).0,
|
||||||
consecutive_failures: AtomicUsize::new(0),
|
consecutive_failures: AtomicUsize::new(0),
|
||||||
generation,
|
generation,
|
||||||
|
shard,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn garbage_collect_on_drop(&self) {
|
fn delete_on_drop(&self) {
|
||||||
let res = self.wanted_garbage_collected.compare_exchange(
|
let res =
|
||||||
false,
|
self.wanted_deleted
|
||||||
true,
|
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
|
||||||
Ordering::Release,
|
|
||||||
Ordering::Relaxed,
|
|
||||||
);
|
|
||||||
|
|
||||||
if res.is_ok() {
|
if res.is_ok() {
|
||||||
LAYER_IMPL_METRICS.inc_started_gcs();
|
LAYER_IMPL_METRICS.inc_started_deletes();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -630,6 +664,10 @@ impl LayerInner {
|
|||||||
// disable any scheduled but not yet running eviction deletions for this
|
// disable any scheduled but not yet running eviction deletions for this
|
||||||
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
|
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
|
||||||
|
|
||||||
|
// count cancellations, which currently remain largely unexpected
|
||||||
|
let init_cancelled =
|
||||||
|
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
|
||||||
|
|
||||||
// no need to make the evict_and_wait wait for the actual download to complete
|
// no need to make the evict_and_wait wait for the actual download to complete
|
||||||
drop(self.status.send(Status::Downloaded));
|
drop(self.status.send(Status::Downloaded));
|
||||||
|
|
||||||
@@ -638,6 +676,8 @@ impl LayerInner {
|
|||||||
.upgrade()
|
.upgrade()
|
||||||
.ok_or_else(|| DownloadError::TimelineShutdown)?;
|
.ok_or_else(|| DownloadError::TimelineShutdown)?;
|
||||||
|
|
||||||
|
// FIXME: grab a gate
|
||||||
|
|
||||||
let can_ever_evict = timeline.remote_client.as_ref().is_some();
|
let can_ever_evict = timeline.remote_client.as_ref().is_some();
|
||||||
|
|
||||||
// check if we really need to be downloaded; could have been already downloaded by a
|
// check if we really need to be downloaded; could have been already downloaded by a
|
||||||
@@ -698,6 +738,8 @@ impl LayerInner {
|
|||||||
tracing::info!(waiters, "completing the on-demand download for other tasks");
|
tracing::info!(waiters, "completing the on-demand download for other tasks");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||||
|
|
||||||
Ok((ResidentOrWantedEvicted::Resident(res), permit))
|
Ok((ResidentOrWantedEvicted::Resident(res), permit))
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -795,10 +837,11 @@ impl LayerInner {
|
|||||||
crate::task_mgr::spawn(
|
crate::task_mgr::spawn(
|
||||||
&tokio::runtime::Handle::current(),
|
&tokio::runtime::Handle::current(),
|
||||||
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
||||||
Some(self.desc.tenant_id),
|
Some(self.desc.tenant_shard_id),
|
||||||
Some(self.desc.timeline_id),
|
Some(self.desc.timeline_id),
|
||||||
&task_name,
|
&task_name,
|
||||||
false,
|
false,
|
||||||
|
timeline.cancel.child_token(),
|
||||||
async move {
|
async move {
|
||||||
|
|
||||||
let client = timeline
|
let client = timeline
|
||||||
@@ -818,6 +861,21 @@ impl LayerInner {
                             Ok(())
                         }
                         Err(e) => {
+                            let consecutive_failures =
+                                this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
+
+                            let backoff = utils::backoff::exponential_backoff_duration_seconds(
+                                consecutive_failures.min(u32::MAX as usize) as u32,
+                                1.5,
+                                60.0,
+                            );
+                            let backoff = std::time::Duration::from_secs_f64(backoff);
+
+                            tokio::select! {
+                                _ = tokio::time::sleep(backoff) => {},
+                                _ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
+                            };
+
                             Err(e)
                         }
                     };
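For reference, the sleep duration in the retry branch above follows a capped exponential: roughly base^n seconds bounded by a maximum. A small sketch of that shape is shown below; the real `utils::backoff::exponential_backoff_duration_seconds` may differ in details such as how the first attempts are treated.

```rust
// Sketch only: approximates the capped exponential backoff used above.
fn backoff_seconds(consecutive_failures: u32, base: f64, max_seconds: f64) -> f64 {
    // grows as base^n and is clamped to the configured ceiling
    base.powi(consecutive_failures as i32).min(max_seconds)
}

// With base = 1.5 and max = 60.0 as in the download task above:
// backoff_seconds(1, 1.5, 60.0) == 1.5
// backoff_seconds(5, 1.5, 60.0) ~= 7.6
// backoff_seconds(11, 1.5, 60.0) == 60.0 (capped)
```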
@@ -826,14 +884,13 @@ impl LayerInner {
|
|||||||
match res {
|
match res {
|
||||||
(Ok(()), _) => {
|
(Ok(()), _) => {
|
||||||
// our caller is cancellation safe so this is fine; if someone
|
// our caller is cancellation safe so this is fine; if someone
|
||||||
// else requests the layer, they'll find it already downloaded
|
// else requests the layer, they'll find it already downloaded.
|
||||||
// or redownload.
|
|
||||||
//
|
//
|
||||||
// however, could be that we should consider marking the layer
|
// See counter [`LayerImplMetrics::inc_init_needed_no_download`]
|
||||||
// for eviction? alas, cannot: because only DownloadedLayer
|
//
|
||||||
// will handle that.
|
// FIXME(#6028): however, could be that we should consider marking the
|
||||||
tracing::info!("layer file download completed after requester had cancelled");
|
// layer for eviction? alas, cannot: because only DownloadedLayer will
|
||||||
LAYER_IMPL_METRICS.inc_download_completed_without_requester();
|
// handle that.
|
||||||
},
|
},
|
||||||
(Err(e), _) => {
|
(Err(e), _) => {
|
||||||
// our caller is cancellation safe, but we might be racing with
|
// our caller is cancellation safe, but we might be racing with
|
||||||
@@ -866,21 +923,7 @@ impl LayerInner {
|
|||||||
|
|
||||||
Ok(permit)
|
Ok(permit)
|
||||||
}
|
}
|
||||||
Ok((Err(e), _permit)) => {
|
Ok((Err(_), _permit)) => Err(DownloadError::DownloadFailed),
|
||||||
// FIXME: this should be with the spawned task and be cancellation sensitive
|
|
||||||
let consecutive_failures =
|
|
||||||
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
|
||||||
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
|
|
||||||
let backoff = utils::backoff::exponential_backoff_duration_seconds(
|
|
||||||
consecutive_failures.min(u32::MAX as usize) as u32,
|
|
||||||
1.5,
|
|
||||||
60.0,
|
|
||||||
);
|
|
||||||
let backoff = std::time::Duration::from_secs_f64(backoff);
|
|
||||||
|
|
||||||
tokio::time::sleep(backoff).await;
|
|
||||||
Err(DownloadError::DownloadFailed)
|
|
||||||
}
|
|
||||||
Err(_gone) => Err(DownloadError::DownloadCancelled),
|
Err(_gone) => Err(DownloadError::DownloadCancelled),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -950,14 +993,17 @@ impl LayerInner {
|
|||||||
|
|
||||||
/// `DownloadedLayer` is being dropped, so it calls this method.
|
/// `DownloadedLayer` is being dropped, so it calls this method.
|
||||||
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
|
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
|
||||||
let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
|
let delete = self.wanted_deleted.load(Ordering::Acquire);
|
||||||
let evict = self.wanted_evicted.load(Ordering::Acquire);
|
let evict = self.wanted_evicted.load(Ordering::Acquire);
|
||||||
let can_evict = self.have_remote_client;
|
let can_evict = self.have_remote_client;
|
||||||
|
|
||||||
if gc {
|
if delete {
|
||||||
// do nothing now, only in LayerInner::drop
|
// do nothing now, only in LayerInner::drop -- this was originally implemented because
|
||||||
|
// we could had already scheduled the deletion at the time.
|
||||||
|
//
|
||||||
|
// FIXME: this is not true anymore, we can safely evict wanted deleted files.
|
||||||
} else if can_evict && evict {
|
} else if can_evict && evict {
|
||||||
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version);
|
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
|
||||||
|
|
||||||
// downgrade for queueing, in case there's a tear down already ongoing we should not
|
// downgrade for queueing, in case there's a tear down already ongoing we should not
|
||||||
// hold it alive.
|
// hold it alive.
|
||||||
@@ -970,7 +1016,7 @@ impl LayerInner {
|
|||||||
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
|
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
|
||||||
let _g = span.entered();
|
let _g = span.entered();
|
||||||
|
|
||||||
// if LayerInner is already dropped here, do nothing because the garbage collection
|
// if LayerInner is already dropped here, do nothing because the delete on drop
|
||||||
// has already ran while we were in queue
|
// has already ran while we were in queue
|
||||||
let Some(this) = this.upgrade() else {
|
let Some(this) = this.upgrade() else {
|
||||||
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
|
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
|
||||||
@@ -1074,7 +1120,7 @@ impl LayerInner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn metadata(&self) -> LayerFileMetadata {
|
fn metadata(&self) -> LayerFileMetadata {
|
||||||
LayerFileMetadata::new(self.desc.file_size, self.generation)
|
LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1189,41 +1235,50 @@ impl DownloadedLayer {

         let res = if owner.desc.is_delta {
             let summary = Some(delta_layer::Summary::expected(
-                owner.desc.tenant_id,
+                owner.desc.tenant_shard_id.tenant_id,
                 owner.desc.timeline_id,
                 owner.desc.key_range.clone(),
                 owner.desc.lsn_range.clone(),
             ));
             delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
                 .await
-                .map(LayerKind::Delta)
+                .map(|res| res.map(LayerKind::Delta))
         } else {
             let lsn = owner.desc.image_layer_lsn();
             let summary = Some(image_layer::Summary::expected(
-                owner.desc.tenant_id,
+                owner.desc.tenant_shard_id.tenant_id,
                 owner.desc.timeline_id,
                 owner.desc.key_range.clone(),
                 lsn,
             ));
             image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
                 .await
-                .map(LayerKind::Image)
-        }
-        // this will be a permanent failure
-        .context("load layer");
+                .map(|res| res.map(LayerKind::Image))
+        };

-        if let Err(e) = res.as_ref() {
-            LAYER_IMPL_METRICS.inc_permanent_loading_failures();
-            // TODO(#5815): we are not logging all errors, so temporarily log them here as well
-            tracing::error!("layer loading failed permanently: {e:#}");
+        match res {
+            Ok(Ok(layer)) => Ok(Ok(layer)),
+            Ok(Err(transient)) => Err(transient),
+            Err(permanent) => {
+                LAYER_IMPL_METRICS.inc_permanent_loading_failures();
+                // TODO(#5815): we are not logging all errors, so temporarily log them **once**
+                // here as well
+                let permanent = permanent.context("load layer");
+                tracing::error!("layer loading failed permanently: {permanent:#}");
+                Ok(Err(permanent))
+            }
         }
-        res
     };
-    self.kind.get_or_init(init).await.as_ref().map_err(|e| {
-        // errors are not clonabled, cannot but stringify
-        // test_broken_timeline matches this string
-        anyhow::anyhow!("layer loading failed: {e:#}")
-    })
+    self.kind
+        .get_or_try_init(init)
+        // return transient errors using `?`
+        .await?
+        .as_ref()
+        .map_err(|e| {
+            // errors are not clonabled, cannot but stringify
+            // test_broken_timeline matches this string
+            anyhow::anyhow!("layer loading failed: {e:#}")
+        })
 }

 async fn get_value_reconstruct_data(
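The switch from `get_or_init` to `get_or_try_init` above means a transient failure (the outer `Err`) is returned to the caller and the once-cell stays uninitialized, so a later access retries the load, while a permanent failure is stored in the cell as the inner `Err`. The crate's `heavier_once_cell` is internal, but the same error-propagating lazy-init pattern exists on `tokio::sync::OnceCell`; a minimal sketch under that assumption, with a made-up config-loading example:

```rust
// Sketch only: error-propagating lazy init, analogous to the
// heavier_once_cell::OnceCell::get_or_try_init call above.
use tokio::sync::OnceCell;

async fn load_config(cell: &OnceCell<String>, path: &str) -> Result<&String, std::io::Error> {
    cell.get_or_try_init(|| async {
        // if this read fails, the cell stays uninitialized and a later call retries
        tokio::fs::read_to_string(path).await
    })
    .await
}
```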
@@ -1352,35 +1407,37 @@ impl From<ResidentLayer> for Layer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
-use metrics::{IntCounter, IntCounterVec};
+use metrics::IntCounter;

-struct LayerImplMetrics {
+pub(crate) struct LayerImplMetrics {
     started_evictions: IntCounter,
     completed_evictions: IntCounter,
-    cancelled_evictions: IntCounterVec,
+    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,

-    started_gcs: IntCounter,
-    completed_gcs: IntCounter,
-    failed_gcs: IntCounterVec,
+    started_deletes: IntCounter,
+    completed_deletes: IntCounter,
+    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,

-    rare_counters: IntCounterVec,
+    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
+    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
 }

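The counters are now pre-resolved per enum variant into an `enum_map::EnumMap`, so the hot path indexes an array (`failed_deletes[reason].inc()`) instead of doing a label lookup on an `IntCounterVec` every time. A minimal sketch of that pattern follows; the label strings and the helper name are placeholders, not the real ones used in this change.

```rust
// Sketch only: the per-variant counter pattern used above.
use enum_map::{Enum, EnumMap};

#[derive(Enum, Clone, Copy)]
enum DeleteFailed {
    TimelineGone,
    DeleteSchedulingFailed,
}

impl DeleteFailed {
    fn as_str(self) -> &'static str {
        match self {
            DeleteFailed::TimelineGone => "timeline_gone",
            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
        }
    }
}

// Resolve one IntCounter per variant once, so incrementing a counter later is
// a plain array index with no label lookup or unwrap on the hot path.
fn per_variant_counters(vec: &metrics::IntCounterVec) -> EnumMap<DeleteFailed, metrics::IntCounter> {
    EnumMap::from_array(std::array::from_fn(|i| {
        let reason = DeleteFailed::from_usize(i);
        vec.with_label_values(&[reason.as_str()])
    }))
}
```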
impl Default for LayerImplMetrics {
|
impl Default for LayerImplMetrics {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
let evictions = metrics::register_int_counter_vec!(
|
use enum_map::Enum;
|
||||||
"pageserver_layer_evictions_count",
|
|
||||||
"Evictions started and completed in the Layer implementation",
|
// reminder: these will be pageserver_layer_* with "_total" suffix
|
||||||
&["state"]
|
|
||||||
|
let started_evictions = metrics::register_int_counter!(
|
||||||
|
"pageserver_layer_started_evictions",
|
||||||
|
"Evictions started in the Layer implementation"
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
let completed_evictions = metrics::register_int_counter!(
|
||||||
|
"pageserver_layer_completed_evictions",
|
||||||
|
"Evictions completed in the Layer implementation"
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let started_evictions = evictions
|
|
||||||
.get_metric_with_label_values(&["started"])
|
|
||||||
.unwrap();
|
|
||||||
let completed_evictions = evictions
|
|
||||||
.get_metric_with_label_values(&["completed"])
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let cancelled_evictions = metrics::register_int_counter_vec!(
|
let cancelled_evictions = metrics::register_int_counter_vec!(
|
||||||
"pageserver_layer_cancelled_evictions_count",
|
"pageserver_layer_cancelled_evictions_count",
|
||||||
@@ -1389,23 +1446,36 @@ impl Default for LayerImplMetrics {
|
|||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let gcs = metrics::register_int_counter_vec!(
|
let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||||
"pageserver_layer_gcs_count",
|
let reason = EvictionCancelled::from_usize(i);
|
||||||
"Garbage collections started and completed in the Layer implementation",
|
let s = reason.as_str();
|
||||||
&["state"]
|
cancelled_evictions.with_label_values(&[s])
|
||||||
|
}));
|
||||||
|
|
||||||
|
let started_deletes = metrics::register_int_counter!(
|
||||||
|
"pageserver_layer_started_deletes",
|
||||||
|
"Deletions on drop pending in the Layer implementation"
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
let completed_deletes = metrics::register_int_counter!(
|
||||||
|
"pageserver_layer_completed_deletes",
|
||||||
|
"Deletions on drop completed in the Layer implementation"
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
-        let started_gcs = gcs.get_metric_with_label_values(&["pending"]).unwrap();
-        let completed_gcs = gcs.get_metric_with_label_values(&["completed"]).unwrap();
-
-        let failed_gcs = metrics::register_int_counter_vec!(
-            "pageserver_layer_failed_gcs_count",
-            "Different reasons for garbage collections to have failed",
+        let failed_deletes = metrics::register_int_counter_vec!(
+            "pageserver_layer_failed_deletes_count",
+            "Different reasons for deletions on drop to have failed",
             &["reason"]
         )
         .unwrap();

+        let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let reason = DeleteFailed::from_usize(i);
+            let s = reason.as_str();
+            failed_deletes.with_label_values(&[s])
+        }));
+
         let rare_counters = metrics::register_int_counter_vec!(
             "pageserver_layer_assumed_rare_count",
             "Times unexpected or assumed rare event happened",
@@ -1413,16 +1483,29 @@ impl Default for LayerImplMetrics {
         )
         .unwrap();

+        let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let event = RareEvent::from_usize(i);
+            let s = event.as_str();
+            rare_counters.with_label_values(&[s])
+        }));
+
+        let inits_cancelled = metrics::register_int_counter!(
+            "pageserver_layer_inits_cancelled_count",
+            "Times Layer initialization was cancelled",
+        )
+        .unwrap();
+
         Self {
             started_evictions,
             completed_evictions,
             cancelled_evictions,

-            started_gcs,
-            completed_gcs,
-            failed_gcs,
+            started_deletes,
+            completed_deletes,
+            failed_deletes,

             rare_counters,
+            inits_cancelled,
         }
     }
 }
@@ -1435,57 +1518,33 @@ impl LayerImplMetrics {
         self.completed_evictions.inc();
     }
     fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
-        self.cancelled_evictions
-            .get_metric_with_label_values(&[reason.as_str()])
-            .unwrap()
-            .inc()
+        self.cancelled_evictions[reason].inc()
     }

-    fn inc_started_gcs(&self) {
-        self.started_gcs.inc();
+    fn inc_started_deletes(&self) {
+        self.started_deletes.inc();
     }
-    fn inc_completed_gcs(&self) {
-        self.completed_gcs.inc();
+    fn inc_completed_deletes(&self) {
+        self.completed_deletes.inc();
     }
-    fn inc_gcs_failed(&self, reason: GcFailed) {
-        self.failed_gcs
-            .get_metric_with_label_values(&[reason.as_str()])
-            .unwrap()
-            .inc();
+    fn inc_deletes_failed(&self, reason: DeleteFailed) {
+        self.failed_deletes[reason].inc();
     }

-    /// Counted separatedly from failed gcs because we will complete the gc attempt regardless of
-    /// failure to delete local file.
-    fn inc_gc_removes_failed(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["gc_remove_failed"])
-            .unwrap()
-            .inc();
+    /// Counted separatedly from failed layer deletes because we will complete the layer deletion
+    /// attempt regardless of failure to delete local file.
+    fn inc_delete_removes_failed(&self) {
+        self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
     }

-    /// Expected rare because requires a race with `evict_blocking` and
-    /// `get_or_maybe_download`.
+    /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
     fn inc_retried_get_or_maybe_download(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["retried_gomd"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
     }

-    /// Expected rare because cancellations are unexpected
-    fn inc_download_completed_without_requester(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["download_completed_without"])
-            .unwrap()
-            .inc();
-    }
-
-    /// Expected rare because cancellations are unexpected
+    /// Expected rare because cancellations are unexpected, and failures are unexpected
     fn inc_download_failed_without_requester(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["download_failed_without"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
     }

     /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
@@ -1493,37 +1552,30 @@ impl LayerImplMetrics {
     /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
     /// Option.
     fn inc_raced_wanted_evicted_accesses(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["raced_wanted_evicted"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
     }

-    /// These are only expected for [`Self::inc_download_completed_without_requester`] amount when
+    /// These are only expected for [`Self::inc_init_cancelled`] amount when
     /// running with remote storage.
     fn inc_init_needed_no_download(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["init_needed_no_download"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::InitWithoutDownload].inc();
     }

     /// Expected rare because all layer files should be readable and good
     fn inc_permanent_loading_failures(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["permanent_loading_failure"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
     }

     fn inc_broadcast_lagged(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["broadcast_lagged"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
+    }
+
+    fn inc_init_cancelled(&self) {
+        self.inits_cancelled.inc()
     }
 }

+#[derive(enum_map::Enum)]
 enum EvictionCancelled {
     LayerGone,
     TimelineGone,
@@ -1552,19 +1604,47 @@ impl EvictionCancelled {
     }
 }

-enum GcFailed {
+#[derive(enum_map::Enum)]
+enum DeleteFailed {
     TimelineGone,
     DeleteSchedulingFailed,
 }

-impl GcFailed {
+impl DeleteFailed {
     fn as_str(&self) -> &'static str {
         match self {
-            GcFailed::TimelineGone => "timeline_gone",
-            GcFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
+            DeleteFailed::TimelineGone => "timeline_gone",
+            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
         }
     }
 }

-static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
+#[derive(enum_map::Enum)]
+enum RareEvent {
+    RemoveOnDropFailed,
+    RetriedGetOrMaybeDownload,
+    DownloadFailedWithoutRequester,
+    UpgradedWantedEvicted,
+    InitWithoutDownload,
+    PermanentLoadingFailure,
+    EvictAndWaitLagged,
+}
+
+impl RareEvent {
+    fn as_str(&self) -> &'static str {
+        use RareEvent::*;
+
+        match self {
+            RemoveOnDropFailed => "remove_on_drop_failed",
+            RetriedGetOrMaybeDownload => "retried_gomd",
+            DownloadFailedWithoutRequester => "download_failed_without",
+            UpgradedWantedEvicted => "raced_wanted_evicted",
+            InitWithoutDownload => "init_needed_no_download",
+            PermanentLoadingFailure => "permanent_loading_failure",
+            EvictAndWaitLagged => "broadcast_lagged",
+        }
+    }
+}
+
+pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
     once_cell::sync::Lazy::new(LayerImplMetrics::default);

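The `Default` impl above registers one labeled counter vector and then pre-resolves a child counter per enum variant into an `enum_map::EnumMap`, so the hot path becomes an array index plus `inc()` instead of a label lookup and `unwrap()`. A minimal standalone sketch of the same pattern, using the upstream `prometheus` crate directly rather than the pageserver's `metrics` facade (all names below are illustrative, not taken from the diff):

    use enum_map::{Enum, EnumMap};
    use prometheus::{register_int_counter_vec, IntCounter};

    #[derive(Enum, Clone, Copy)]
    enum Outcome {
        Ok,
        Failed,
    }

    impl Outcome {
        fn as_str(self) -> &'static str {
            match self {
                Outcome::Ok => "ok",
                Outcome::Failed => "failed",
            }
        }
    }

    struct Counters {
        per_outcome: EnumMap<Outcome, IntCounter>,
    }

    impl Default for Counters {
        fn default() -> Self {
            let vec = register_int_counter_vec!(
                "example_outcomes_total",
                "Outcomes of an example operation",
                &["outcome"]
            )
            .unwrap();
            // Resolve every label value once, up front; afterwards incrementing
            // is just `counters.per_outcome[Outcome::Failed].inc()`.
            let per_outcome = EnumMap::from_array(std::array::from_fn(|i| {
                let outcome = Outcome::from_usize(i);
                vec.with_label_values(&[outcome.as_str()])
            }));
            Self { per_outcome }
        }
    }

The `as_str` mapping keeps the exported Prometheus label names stable even if the enum variants are later renamed, which is the same reason the diff keeps `RareEvent::as_str` next to the enum.
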
@@ -1,9 +1,7 @@
 use core::fmt::Display;
+use pageserver_api::shard::TenantShardId;
 use std::ops::Range;
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use utils::{id::TimelineId, lsn::Lsn};

 use crate::repository::Key;

@@ -11,12 +9,15 @@ use super::{DeltaFileName, ImageFileName, LayerFileName};

 use serde::{Deserialize, Serialize};

+#[cfg(test)]
+use utils::id::TenantId;
+
 /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
 /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct PersistentLayerDesc {
-    pub tenant_id: TenantId,
+    pub tenant_shard_id: TenantShardId,
     pub timeline_id: TimelineId,
     /// Range of keys that this layer covers
     pub key_range: Range<Key>,
@@ -56,7 +57,7 @@ impl PersistentLayerDesc {
     #[cfg(test)]
     pub fn new_test(key_range: Range<Key>) -> Self {
         Self {
-            tenant_id: TenantId::generate(),
+            tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
             timeline_id: TimelineId::generate(),
             key_range,
             lsn_range: Lsn(0)..Lsn(1),
@@ -66,14 +67,14 @@ impl PersistentLayerDesc {
     }

     pub fn new_img(
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         key_range: Range<Key>,
         lsn: Lsn,
         file_size: u64,
     ) -> Self {
         Self {
-            tenant_id,
+            tenant_shard_id,
             timeline_id,
             key_range,
             lsn_range: Self::image_layer_lsn_range(lsn),
@@ -83,14 +84,14 @@ impl PersistentLayerDesc {
     }

     pub fn new_delta(
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         key_range: Range<Key>,
         lsn_range: Range<Lsn>,
         file_size: u64,
     ) -> Self {
         Self {
-            tenant_id,
+            tenant_shard_id,
             timeline_id,
             key_range,
             lsn_range,
@@ -100,18 +101,22 @@ impl PersistentLayerDesc {
     }

     pub fn from_filename(
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         filename: LayerFileName,
         file_size: u64,
     ) -> Self {
         match filename {
             LayerFileName::Image(i) => {
-                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
-            }
-            LayerFileName::Delta(d) => {
-                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
+                Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
             }
+            LayerFileName::Delta(d) => Self::new_delta(
+                tenant_shard_id,
+                timeline_id,
+                d.key_range,
+                d.lsn_range,
+                file_size,
+            ),
         }
     }

@@ -172,10 +177,6 @@ impl PersistentLayerDesc {
         self.timeline_id
     }

-    pub fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
-
     /// Does this layer only contain some data for the key-range (incremental),
     /// or does it contain a version of every page? This is important to know
     /// for garbage collecting old layers: an incremental layer depends on
@@ -192,7 +193,7 @@ impl PersistentLayerDesc {
         if self.is_delta {
             println!(
                 "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
-                self.tenant_id,
+                self.tenant_shard_id,
                 self.timeline_id,
                 self.key_range.start,
                 self.key_range.end,
@@ -204,7 +205,7 @@ impl PersistentLayerDesc {
         } else {
             println!(
                 "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
-                self.tenant_id,
+                self.tenant_shard_id,
                 self.timeline_id,
                 self.key_range.start,
                 self.key_range.end,

@@ -44,6 +44,7 @@ pub(crate) enum BackgroundLoopKind {
     Eviction,
     ConsumptionMetricsCollectMetrics,
     ConsumptionMetricsSyntheticSizeWorker,
+    InitialLogicalSizeCalculation,
 }

 impl BackgroundLoopKind {
@@ -53,31 +54,21 @@ impl BackgroundLoopKind {
     }
 }

-pub(crate) enum RateLimitError {
-    Cancelled,
-}
-
-pub(crate) async fn concurrent_background_tasks_rate_limit(
+pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
     loop_kind: BackgroundLoopKind,
     _ctx: &RequestContext,
-    cancel: &CancellationToken,
-) -> Result<impl Drop, RateLimitError> {
+) -> impl Drop {
     crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
         .with_label_values(&[loop_kind.as_static_str()])
         .inc();

     scopeguard::defer!(
         crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
     );
-    tokio::select! {
-        permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
-            match permit {
-                Ok(permit) => Ok(permit),
-                Err(_closed) => unreachable!("we never close the semaphore"),
-            }
-        },
-        _ = cancel.cancelled() => {
-            Err(RateLimitError::Cancelled)
-        }
+    match CONCURRENT_BACKGROUND_TASKS.acquire().await {
+        Ok(permit) => permit,
+        Err(_closed) => unreachable!("we never close the semaphore"),
     }
 }

@@ -86,14 +77,15 @@ pub fn start_background_loops(
     tenant: &Arc<Tenant>,
     background_jobs_can_start: Option<&completion::Barrier>,
 ) {
-    let tenant_id = tenant.tenant_id;
+    let tenant_shard_id = tenant.tenant_shard_id;
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::Compaction,
-        Some(tenant_id),
+        Some(tenant_shard_id),
         None,
-        &format!("compactor for tenant {tenant_id}"),
+        &format!("compactor for tenant {tenant_shard_id}"),
         false,
+        tenant.cancel.child_token(),
         {
             let tenant = Arc::clone(tenant);
             let background_jobs_can_start = background_jobs_can_start.cloned();
@@ -104,7 +96,7 @@ pub fn start_background_loops(
                 _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
             };
             compaction_loop(tenant, cancel)
-                .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
+                .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                 .await;
             Ok(())
         }
@@ -113,10 +105,11 @@ pub fn start_background_loops(
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::GarbageCollector,
-        Some(tenant_id),
+        Some(tenant_shard_id),
         None,
-        &format!("garbage collector for tenant {tenant_id}"),
+        &format!("garbage collector for tenant {tenant_shard_id}"),
         false,
+        tenant.cancel.child_token(),
         {
             let tenant = Arc::clone(tenant);
             let background_jobs_can_start = background_jobs_can_start.cloned();
@@ -127,7 +120,7 @@ pub fn start_background_loops(
                 _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
             };
             gc_loop(tenant, cancel)
-                .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
+                .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                 .await;
             Ok(())
         }
@@ -180,16 +173,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             // Run compaction
             if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
                 let wait_duration = backoff::exponential_backoff_duration_seconds(
-                    error_run_count,
+                    error_run_count + 1,
                     1.0,
                     MAX_BACKOFF_SECS,
                 );
                 error_run_count += 1;
+                let wait_duration = Duration::from_secs_f64(wait_duration);
                 error!(
-                    "Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
-                    wait_duration
+                    "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
                 );
-                Duration::from_secs_f64(wait_duration)
+                wait_duration
             } else {
                 error_run_count = 0;
                 period
@@ -198,6 +191,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

         warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);

+        // Perhaps we did no work and the walredo process has been idle for some time:
+        // give it a chance to shut down to avoid leaving walredo process running indefinitely.
+        tenant.walredo_mgr.maybe_quiesce(period * 10);
+
         // Sleep
         if tokio::time::timeout(sleep_duration, cancel.cancelled())
             .await
@@ -257,20 +254,20 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             } else {
                 // Run gc
                 let res = tenant
-                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
+                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
                     .await;
                 if let Err(e) = res {
                     let wait_duration = backoff::exponential_backoff_duration_seconds(
-                        error_run_count,
+                        error_run_count + 1,
                         1.0,
                         MAX_BACKOFF_SECS,
                     );
                     error_run_count += 1;
+                    let wait_duration = Duration::from_secs_f64(wait_duration);
                     error!(
-                        "Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
-                        wait_duration
+                        "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
                     );
-                    Duration::from_secs_f64(wait_duration)
+                    wait_duration
                 } else {
                     error_run_count = 0;
                     period

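In the hunk above, the rate-limit helper stops taking a `CancellationToken` and simply waits on the global semaphore, handing back the permit as `impl Drop`; callers that want to abandon the wait now race the acquisition against their own token, which is exactly what the eviction task hunks further down do. A standalone sketch of that call-site shape, using a plain `tokio::sync::Semaphore` as a stand-in for the pageserver's background-task limit (function names are illustrative):

    use std::sync::Arc;
    use tokio::sync::{OwnedSemaphorePermit, Semaphore};
    use tokio_util::sync::CancellationToken;

    /// Stand-in for the helper: always waits, never fails, and hands back a
    /// guard that releases the slot when dropped.
    async fn acquire_background_permit(limit: Arc<Semaphore>) -> OwnedSemaphorePermit {
        match limit.acquire_owned().await {
            Ok(permit) => permit,
            Err(_closed) => unreachable!("we never close the semaphore"),
        }
    }

    /// One loop iteration: the caller, not the helper, decides what cancellation means.
    async fn one_iteration(limit: Arc<Semaphore>, cancel: &CancellationToken) -> bool {
        let _permit = tokio::select! {
            permit = acquire_background_permit(limit) => permit,
            _ = cancel.cancelled() => return false, // bail out of this iteration
        };
        // ... the rate-limited work runs while `_permit` is held ...
        true
    }

Moving the cancellation decision to the caller keeps the helper infallible and removes the `RateLimitError` type entirely.
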
File diff suppressed because it is too large.

@@ -4,13 +4,11 @@ use std::{
 };

 use anyhow::Context;
-use pageserver_api::models::TimelineState;
+use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument, Span};
-use utils::{
-    crashsafe, fs_ext,
-    id::{TenantId, TimelineId},
-};
+use utils::{crashsafe, fs_ext, id::TimelineId};

 use crate::{
     config::PageServerConf,
@@ -24,7 +22,6 @@ use crate::{
         },
         CreateTimelineCause, DeleteTimelineError, Tenant,
     },
-    InitializationOrder,
 };

 use super::{Timeline, TimelineResources};
@@ -47,7 +44,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
     // Shut down the layer flush task before the remote client, as one depends on the other
     task_mgr::shutdown_tasks(
         Some(TaskKind::LayerFlushTask),
-        Some(timeline.tenant_id),
+        Some(timeline.tenant_shard_id),
         Some(timeline.timeline_id),
     )
     .await;
@@ -73,7 +70,12 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
     // NB: This and other delete_timeline calls do not run as a task_mgr task,
     // so, they are not affected by this shutdown_tasks() call.
     info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
+    task_mgr::shutdown_tasks(
+        None,
+        Some(timeline.tenant_shard_id),
+        Some(timeline.timeline_id),
+    )
+    .await;

     fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
         Err(anyhow::anyhow!(
@@ -110,40 +112,11 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
     Ok(())
 }

-// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
-// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
-// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote"
-// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
-// After index part is deleted presence of this mark file indentifies that it was a deletion intention.
-// So we can just remove the mark file.
-async fn create_delete_mark(
-    conf: &PageServerConf,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> Result<(), DeleteTimelineError> {
-    fail::fail_point!("timeline-delete-before-delete-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-before-delete-mark"
-        ))?
-    });
-    let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
-
-    // Note: we're ok to replace existing file.
-    let _ = std::fs::OpenOptions::new()
-        .write(true)
-        .create(true)
-        .open(&marker_path)
-        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
-
-    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
-    Ok(())
-}
-
-/// Grab the layer_removal_cs lock, and actually perform the deletion.
+/// Grab the compaction and gc locks, and actually perform the deletion.
 ///
-/// This lock prevents prevents GC or compaction from running at the same time.
-/// The GC task doesn't register itself with the timeline it's operating on,
-/// so it might still be running even though we called `shutdown_tasks`.
+/// The locks prevent GC or compaction from running at the same time. The background tasks do not
+/// register themselves with the timeline it's operating on, so it might still be running even
+/// though we called `shutdown_tasks`.
 ///
 /// Note that there are still other race conditions between
 /// GC, compaction and timeline deletion. See
@@ -151,19 +124,24 @@ async fn create_delete_mark(
 ///
 /// No timeout here, GC & Compaction should be responsive to the
 /// `TimelineState::Stopping` change.
-async fn delete_local_layer_files(
+// pub(super): documentation link
+pub(super) async fn delete_local_layer_files(
     conf: &PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     timeline: &Timeline,
 ) -> anyhow::Result<()> {
-    info!("waiting for layer_removal_cs.lock()");
-    let layer_removal_guard = timeline.layer_removal_cs.lock().await;
-    info!("got layer_removal_cs.lock(), deleting layer files");
+    let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
+    let guards = crate::timed(
+        guards,
+        "acquire gc and compaction locks",
+        std::time::Duration::from_secs(5),
+    )
+    .await;

     // NB: storage_sync upload tasks that reference these layers have been cancelled
     // by the caller.

-    let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
+    let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);

     fail::fail_point!("timeline-delete-before-rm", |_| {
         Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
@@ -179,8 +157,8 @@ async fn delete_local_layer_files(
             // because of a previous failure/cancellation at/after
             // failpoint timeline-delete-after-rm.
             //
-            // It can also happen if we race with tenant detach, because,
-            // it doesn't grab the layer_removal_cs lock.
+            // ErrorKind::NotFound can also happen if we race with tenant detach, because,
+            // no locks are shared.
             //
             // For now, log and continue.
             // warn! level is technically not appropriate for the
@@ -199,7 +177,7 @@ async fn delete_local_layer_files(
         return Ok(());
     }

-    let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
+    let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id);

     for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
         #[cfg(feature = "testing")]
@@ -248,8 +226,8 @@ async fn delete_local_layer_files(
             .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
     }

-    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
-    drop(layer_removal_guard);
+    info!("finished deleting layer files, releasing locks");
+    drop(guards);

     fail::fail_point!("timeline-delete-after-rm", |_| {
         Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
@@ -274,11 +252,11 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
 // (nothing can fail after its deletion)
 async fn cleanup_remaining_timeline_fs_traces(
     conf: &PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
 ) -> anyhow::Result<()> {
     // Remove local metadata
-    tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
+    tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id))
         .await
         .or_else(fs_ext::ignore_not_found)
         .context("remove metadata")?;
@@ -290,7 +268,7 @@ async fn cleanup_remaining_timeline_fs_traces(
     });

     // Remove timeline dir
-    tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
+    tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id))
         .await
         .or_else(fs_ext::ignore_not_found)
         .context("timeline dir")?;
@@ -305,13 +283,15 @@ async fn cleanup_remaining_timeline_fs_traces(
     // to be reordered later and thus missed if a crash occurs.
     // Note that we dont need to sync after mark file is removed
     // because we can tolerate the case when mark file reappears on startup.
-    let timeline_path = conf.timelines_path(&tenant_id);
+    let timeline_path = conf.timelines_path(&tenant_shard_id);
     crashsafe::fsync_async(timeline_path)
         .await
         .context("fsync_pre_mark_remove")?;

     // Remove delete mark
-    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
+    // TODO: once we are confident that no more exist in the field, remove this
+    // line. It cleans up a legacy marker file that might in rare cases be present.
+    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
         .await
         .or_else(fs_ext::ignore_not_found)
         .context("remove delete mark")
@@ -377,7 +357,7 @@ impl DeleteTimelineFlow {
     // NB: If this fails half-way through, and is retried, the retry will go through
     // all the same steps again. Make sure the code here is idempotent, and don't
     // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
+    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))]
     pub async fn run(
         tenant: &Arc<Tenant>,
         timeline_id: TimelineId,
@@ -391,8 +371,6 @@ impl DeleteTimelineFlow {

         set_deleted_in_remote_index(&timeline).await?;

-        create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
-
         fail::fail_point!("timeline-delete-before-schedule", |_| {
             Err(anyhow::anyhow!(
                 "failpoint: timeline-delete-before-schedule"
@@ -429,7 +407,7 @@ impl DeleteTimelineFlow {
         local_metadata: &TimelineMetadata,
         remote_client: Option<RemoteTimelineClient>,
         deletion_queue_client: DeletionQueueClient,
-        init_order: Option<&InitializationOrder>,
+        cancel: CancellationToken,
     ) -> anyhow::Result<()> {
         // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
         // RemoteTimelineClient is the only functioning part.
@@ -442,10 +420,10 @@ impl DeleteTimelineFlow {
                 remote_client,
                 deletion_queue_client,
             },
-            init_order,
             // Important. We dont pass ancestor above because it can be missing.
             // Thus we need to skip the validation here.
             CreateTimelineCause::Delete,
+            cancel,
         )
         .context("create_timeline_struct")?;

@@ -464,10 +442,6 @@ impl DeleteTimelineFlow {

         guard.mark_in_progress()?;

-        // Note that delete mark can be missing on resume
-        // because we create delete mark after we set deleted_at in the index part.
-        create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
-
         Self::schedule_background(guard, tenant.conf, tenant, timeline);

         Ok(())
@@ -479,7 +453,8 @@ impl DeleteTimelineFlow {
         timeline_id: TimelineId,
     ) -> anyhow::Result<()> {
         let r =
-            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
+            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
+                .await;
         info!("Done");
         r
     }
@@ -550,16 +525,17 @@ impl DeleteTimelineFlow {
         tenant: Arc<Tenant>,
         timeline: Arc<Timeline>,
     ) {
-        let tenant_id = timeline.tenant_id;
+        let tenant_shard_id = timeline.tenant_shard_id;
         let timeline_id = timeline.timeline_id;

         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             TaskKind::TimelineDeletionWorker,
-            Some(tenant_id),
+            Some(tenant_shard_id),
             Some(timeline_id),
             "timeline_delete",
             false,
+            tenant.cancel.child_token(),
             async move {
                 if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
                     error!("Error: {err:#}");
@@ -569,7 +545,7 @@ impl DeleteTimelineFlow {
             }
             .instrument({
                 let span =
-                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
+                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id);
                 span.follows_from(Span::current());
                 span
             }),
@@ -582,13 +558,14 @@ impl DeleteTimelineFlow {
         tenant: &Tenant,
         timeline: &Timeline,
     ) -> Result<(), DeleteTimelineError> {
-        delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
+        delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?;

         delete_remote_layers_and_index(timeline).await?;

         pausable_failpoint!("in_progress_delete");

-        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
+        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id)
+            .await?;

         remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

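The deletion path above now takes the gc and compaction locks with `tokio::join!` and wraps the wait in `crate::timed(...)` with a five second threshold. That helper's implementation is not part of this compare; purely as a hedged sketch of the shape such a wrapper could take (hypothetical code, not the pageserver's `crate::timed`):

    use std::future::Future;
    use std::time::Duration;
    use tracing::info;

    /// Hypothetical sketch of a "timed" wrapper: await `fut`, but log once if it
    /// has not finished within `warn_after`, then keep waiting for the result.
    async fn timed_sketch<F: Future>(fut: F, what: &str, warn_after: Duration) -> F::Output {
        tokio::pin!(fut);
        match tokio::time::timeout(warn_after, &mut fut).await {
            Ok(output) => output,
            Err(_elapsed) => {
                info!("{what} is taking longer than {warn_after:?}, still waiting");
                fut.await
            }
        }
    }

In the diff the wrapped future is the joined pair of lock acquisitions, so a slow GC or compaction pass surfaces in the logs instead of silently stalling the delete.
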
@@ -30,7 +30,7 @@ use crate::{
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
     tenant::{
         config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        tasks::{BackgroundLoopKind, RateLimitError},
+        tasks::BackgroundLoopKind,
         timeline::EvictionError,
         LogicalSizeCalculationCause, Tenant,
     },
@@ -60,10 +60,14 @@ impl Timeline {
         task_mgr::spawn(
             BACKGROUND_RUNTIME.handle(),
             TaskKind::Eviction,
-            Some(self.tenant_id),
+            Some(self.tenant_shard_id),
             Some(self.timeline_id),
-            &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
+            &format!(
+                "layer eviction for {}/{}",
+                self.tenant_shard_id, self.timeline_id
+            ),
             false,
+            self.cancel.child_token(),
             async move {
                 let cancel = task_mgr::shutdown_token();
                 tokio::select! {
@@ -77,7 +81,7 @@ impl Timeline {
         );
     }

-    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
+    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
     async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
         use crate::tenant::tasks::random_init_delay;
         {
@@ -155,15 +159,14 @@ impl Timeline {
     ) -> ControlFlow<()> {
         let now = SystemTime::now();

-        let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
+        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
             BackgroundLoopKind::Eviction,
             ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
+        );
+
+        let _permit = tokio::select! {
+            permit = acquire_permit => permit,
+            _ = cancel.cancelled() => return ControlFlow::Break(()),
         };

         // If we evict layers but keep cached values derived from those layers, then
@@ -209,11 +212,21 @@ impl Timeline {
         // Gather layers for eviction.
         // NB: all the checks can be invalidated as soon as we release the layer map lock.
         // We don't want to hold the layer map lock during eviction.

         // So, we just need to deal with this.
-        let candidates: Vec<_> = {
+        let remote_client = match self.remote_client.as_ref() {
+            Some(c) => c,
+            None => {
+                error!("no remote storage configured, cannot evict layers");
+                return ControlFlow::Continue(());
+            }
+        };
+
+        let mut js = tokio::task::JoinSet::new();
+        {
             let guard = self.layers.read().await;
             let layers = guard.layer_map();
-            let mut candidates = Vec::new();
             for hist_layer in layers.iter_historic_layers() {
                 let hist_layer = guard.get_from_desc(&hist_layer);

@@ -259,55 +272,49 @@ impl Timeline {
                         continue;
                     }
                 };
+                let layer = guard.drop_eviction_guard();
                 if no_activity_for > p.threshold {
-                    candidates.push(guard.drop_eviction_guard())
+                    let remote_client = remote_client.clone();
+                    // this could cause a lot of allocations in some cases
+                    js.spawn(async move { layer.evict_and_wait(&remote_client).await });
+                    stats.candidates += 1;
                 }
             }
-            candidates
-        };
-        stats.candidates = candidates.len();
-
-        let remote_client = match self.remote_client.as_ref() {
-            None => {
-                error!(
-                    num_candidates = candidates.len(),
-                    "no remote storage configured, cannot evict layers"
-                );
-                return ControlFlow::Continue(());
-            }
-            Some(c) => c,
         };

-        let results = match self.evict_layer_batch(remote_client, &candidates).await {
-            Err(pre_err) => {
-                stats.errors += candidates.len();
-                error!("could not do any evictions: {pre_err:#}");
-                return ControlFlow::Continue(());
+        let join_all = async move {
+            while let Some(next) = js.join_next().await {
+                match next {
+                    Ok(Ok(())) => stats.evicted += 1,
+                    Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                        stats.not_evictable += 1;
+                    }
+                    Err(je) if je.is_cancelled() => unreachable!("not used"),
+                    Err(je) if je.is_panic() => {
+                        /* already logged */
+                        stats.errors += 1;
+                    }
+                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+                }
             }
-            Ok(results) => results,
+            stats
         };
-        assert_eq!(results.len(), candidates.len());
-        for result in results {
-            match result {
-                None => {
-                    stats.skipped_for_shutdown += 1;
-                }
-                Some(Ok(())) => {
-                    stats.evicted += 1;
-                }
-                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                    // compaction/gc removed the file while we were waiting on layer_removal_cs
-                    stats.not_evictable += 1;
+
+        tokio::select! {
+            stats = join_all => {
+                if stats.candidates == stats.not_evictable {
+                    debug!(stats=?stats, "eviction iteration complete");
+                } else if stats.errors > 0 || stats.not_evictable > 0 {
+                    warn!(stats=?stats, "eviction iteration complete");
+                } else {
+                    info!(stats=?stats, "eviction iteration complete");
                 }
             }
+            _ = cancel.cancelled() => {
+                // just drop the joinset to "abort"
+            }
         }
-        if stats.candidates == stats.not_evictable {
-            debug!(stats=?stats, "eviction iteration complete");
-        } else if stats.errors > 0 || stats.not_evictable > 0 {
-            warn!(stats=?stats, "eviction iteration complete");
-        } else {
-            info!(stats=?stats, "eviction iteration complete");
-        }
+
         ControlFlow::Continue(())
     }

@@ -341,7 +348,7 @@ impl Timeline {
         // Make one of the tenant's timelines draw the short straw and run the calculation.
         // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
+        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
@@ -351,7 +358,7 @@ impl Timeline {
         match state.last_layer_access_imitation {
             Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
             _ => {
-                self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
+                self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
                     .await;
                 state.last_layer_access_imitation = Some(tokio::time::Instant::now());
             }
@@ -417,8 +424,8 @@ impl Timeline {
     async fn imitate_synthetic_size_calculation_worker(
         &self,
         tenant: &Arc<Tenant>,
-        ctx: &RequestContext,
         cancel: &CancellationToken,
+        ctx: &RequestContext,
     ) {
         if self.conf.metric_collection_endpoint.is_none() {
             // We don't start the consumption metrics task if this is not set in the config.
@@ -457,6 +464,7 @@ impl Timeline {
             None,
             &mut throwaway_cache,
             LogicalSizeCalculationCause::EvictionTaskImitation,
+            cancel,
             ctx,
         )
         .instrument(info_span!("gather_inputs"));

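The rewritten eviction iteration above spawns one `evict_and_wait` per candidate into a `tokio::task::JoinSet`, drains it while aggregating stats, and races the drain against the cancellation token; the "just drop the joinset to abort" comment relies on the fact that dropping a `JoinSet` aborts whatever tasks are still running. A standalone sketch of that drain-or-cancel shape, with placeholder work instead of layer eviction:

    use tokio::task::JoinSet;
    use tokio_util::sync::CancellationToken;

    async fn drain_or_cancel(cancel: CancellationToken) -> (usize, usize) {
        let mut js = JoinSet::new();
        for i in 0..4u64 {
            // Placeholder work; in the pageserver this would be layer.evict_and_wait(...).
            js.spawn(async move { i % 2 == 0 });
        }

        let join_all = async move {
            let (mut ok, mut not_ok) = (0, 0);
            while let Some(next) = js.join_next().await {
                match next {
                    Ok(true) => ok += 1,
                    Ok(false) => not_ok += 1,
                    Err(join_err) => eprintln!("task panicked or was aborted: {join_err}"),
                }
            }
            (ok, not_ok)
        };

        tokio::select! {
            stats = join_all => stats,
            _ = cancel.cancelled() => {
                // `js` is owned by `join_all`, which is dropped here; dropping the
                // JoinSet aborts the remaining tasks.
                (0, 0)
            }
        }
    }

Compared with the old `evict_layer_batch` call, this lets each candidate make progress independently while still giving the loop a single cancellation point.
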
@@ -13,6 +13,7 @@ use crate::{
 };
 use anyhow::Context;
 use camino::Utf8Path;
+use pageserver_api::shard::ShardIndex;
 use std::{collections::HashMap, str::FromStr};
 use utils::lsn::Lsn;

@@ -107,6 +108,7 @@ pub(super) fn reconcile(
     index_part: Option<&IndexPart>,
     disk_consistent_lsn: Lsn,
     generation: Generation,
+    shard: ShardIndex,
 ) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
     use Decision::*;

@@ -118,10 +120,13 @@ pub(super) fn reconcile(
         .map(|(name, file_size)| {
             (
                 name,
-                // The generation here will be corrected to match IndexPart in the merge below, unless
+                // The generation and shard here will be corrected to match IndexPart in the merge below, unless
                 // it is not in IndexPart, in which case using our current generation makes sense
                 // because it will be uploaded in this generation.
-                (Some(LayerFileMetadata::new(file_size, generation)), None),
+                (
+                    Some(LayerFileMetadata::new(file_size, generation, shard)),
+                    None,
+                ),
             )
         })
         .collect::<Collected>();

@@ -1,8 +1,9 @@
 use anyhow::{bail, ensure, Context, Result};
+use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
 use utils::{
-    id::{TenantId, TimelineId},
+    id::TimelineId,
     lsn::{AtomicLsn, Lsn},
 };

@@ -73,7 +74,7 @@ impl LayerManager {
         last_record_lsn: Lsn,
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
     ) -> Result<Arc<InMemoryLayer>> {
         ensure!(lsn.is_aligned());

@@ -109,7 +110,8 @@ impl LayerManager {
             lsn
         );

-        let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?;
+        let new_layer =
+            InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?;
         let layer = Arc::new(new_layer);

         self.layer_map.open_layer = Some(layer.clone());
@@ -190,7 +192,6 @@ impl LayerManager {
     /// Called when compaction is completed.
     pub(crate) fn finish_compact_l0(
         &mut self,
-        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
         compact_from: &[Layer],
         compact_to: &[ResidentLayer],
         metrics: &TimelineMetrics,
@@ -201,25 +202,16 @@ impl LayerManager {
             metrics.record_new_file_metrics(l.layer_desc().file_size);
         }
         for l in compact_from {
-            Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr);
+            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
         }
         updates.flush();
     }

-    /// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
-    pub(crate) fn finish_gc_timeline(
-        &mut self,
-        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
-        gc_layers: Vec<Layer>,
-    ) {
+    /// Called when garbage collect has selected the layers to be removed.
+    pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) {
         let mut updates = self.layer_map.batch_update();
         for doomed_layer in gc_layers {
-            Self::delete_historic_layer(
-                layer_removal_cs,
-                &doomed_layer,
-                &mut updates,
-                &mut self.layer_fmgr,
-            );
+            Self::delete_historic_layer(doomed_layer, &mut updates, &mut self.layer_fmgr);
         }
         updates.flush()
     }
@@ -238,7 +230,6 @@ impl LayerManager {
     /// Remote storage is not affected by this operation.
     fn delete_historic_layer(
         // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
         layer: &Layer,
         updates: &mut BatchedUpdates<'_>,
         mapping: &mut LayerFileManager<Layer>,
@@ -252,7 +243,7 @@ impl LayerManager {
         // map index without actually rebuilding the index.
         updates.remove_historic(desc);
         mapping.remove(layer);
-        layer.garbage_collect_on_drop();
+        layer.delete_on_drop();
     }

     pub(crate) fn contains(&self, layer: &Layer) -> bool {

@@ -1,11 +1,10 @@
 use anyhow::Context;
-use once_cell::sync::OnceCell;

-use tokio::sync::Semaphore;
+use once_cell::sync::OnceCell;
+use tokio_util::sync::CancellationToken;
 use utils::lsn::Lsn;

-use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
-use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};

 /// Internal structure to hold all data needed for logical size calculation.
 ///
@@ -23,10 +22,17 @@ pub(super) struct LogicalSize {
     ///
     /// NOTE: size at a given LSN is constant, but after a restart we will calculate
     /// the initial size at a different LSN.
-    pub initial_logical_size: OnceCell<u64>,
+    pub initial_logical_size: OnceCell<(
+        u64,
+        crate::metrics::initial_logical_size::FinishedCalculationGuard,
+    )>,

-    /// Semaphore to track ongoing calculation of `initial_logical_size`.
-    pub initial_size_computation: Arc<tokio::sync::Semaphore>,
+    /// Cancellation for the best-effort logical size calculation.
+    ///
+    /// The token is kept in a once-cell so that we can error out if a higher priority
+    /// request comes in *before* we have started the normal logical size calculation.
+    pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
+        OnceCell<CancellationToken>,

     /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
     pub initial_part_end: Option<Lsn>,
@@ -52,25 +58,57 @@ pub(super) struct LogicalSize {
     /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
     /// to modify this, it will also keep the prometheus metric in sync.
     pub size_added_after_initial: AtomicI64,
+
+    /// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`].
+    pub(super) did_return_approximate_to_walreceiver: AtomicBool,
 }

 /// Normalized current size, that the data in pageserver occupies.
 #[derive(Debug, Clone, Copy)]
-pub(super) enum CurrentLogicalSize {
+pub(crate) enum CurrentLogicalSize {
     /// The size is not yet calculated to the end, this is an intermediate result,
     /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
     /// yet total logical size cannot be below 0.
-    Approximate(u64),
+    Approximate(Approximate),
     // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
     // available for observation without any calculations.
-    Exact(u64),
+    Exact(Exact),
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub(crate) enum Accuracy {
+    Approximate,
+    Exact,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct Approximate(u64);
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct Exact(u64);
+
+impl From<&Approximate> for u64 {
+    fn from(value: &Approximate) -> Self {
+        value.0
+    }
+}
+
+impl From<&Exact> for u64 {
+    fn from(val: &Exact) -> Self {
+        val.0
+    }
 }

 impl CurrentLogicalSize {
-    pub(super) fn size(&self) -> u64 {
-        *match self {
-            Self::Approximate(size) => size,
-            Self::Exact(size) => size,
+    pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
+        match self {
+            Self::Approximate(size) => size.into(),
+            Self::Exact(size) => size.into(),
+        }
+    }
+
+    pub(crate) fn accuracy(&self) -> Accuracy {
+        match self {
+            Self::Approximate(_) => Accuracy::Approximate,
+            Self::Exact(_) => Accuracy::Exact,
         }
     }
 }
@@ -78,36 +116,42 @@ impl CurrentLogicalSize {
 impl LogicalSize {
     pub(super) fn empty_initial() -> Self {
         Self {
-            initial_logical_size: OnceCell::with_value(0),
-            // initial_logical_size already computed, so, don't admit any calculations
-            initial_size_computation: Arc::new(Semaphore::new(0)),
+            initial_logical_size: OnceCell::with_value((0, {
+                crate::metrics::initial_logical_size::START_CALCULATION
+                    .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
+                    .calculation_result_saved()
+            })),
+            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
             initial_part_end: None,
             size_added_after_initial: AtomicI64::new(0),
+            did_return_approximate_to_walreceiver: AtomicBool::new(false),
         }
     }

     pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
         Self {
             initial_logical_size: OnceCell::new(),
-            initial_size_computation: Arc::new(Semaphore::new(1)),
+            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
             initial_part_end: Some(compute_to),
             size_added_after_initial: AtomicI64::new(0),
+            did_return_approximate_to_walreceiver: AtomicBool::new(false),
         }
     }

-    pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
+    pub(super) fn current_size(&self) -> CurrentLogicalSize {
         let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
         //                  ^^^ keep this type explicit so that the casts in this function break if
         //                      we change the type.
         match self.initial_logical_size.get() {
-            Some(initial_size) => {
-                initial_size.checked_add_signed(size_increment)
+            Some((initial_size, _)) => {
+                CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
                     .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                    .map(CurrentLogicalSize::Exact)
+                    .unwrap()))
             }
             None => {
                 let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
-                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
+                CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment))
             }
         }
     }
@@ -121,7 +165,7 @@ impl LogicalSize {
     /// available for re-use. This doesn't contain the incremental part.
     pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
         match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
+            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
            _ => None,
         }
     }
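
Aside: to see the new `Approximate`/`Exact` wrapper API in isolation, here is a standalone reproduction of the pattern from the hunk above. It re-declares simplified versions of the types (without the pageserver's visibility modifiers, `From` impls, and metrics guard) purely so the accessor behavior can be exercised outside the crate.

```rust
// Newtype wrappers force callers to acknowledge whether a size is exact or approximate.
#[derive(Debug, Clone, Copy)]
struct Approximate(u64);
#[derive(Debug, Clone, Copy)]
struct Exact(u64);

#[derive(Debug, Clone, Copy)]
enum CurrentLogicalSize {
    Approximate(Approximate),
    Exact(Exact),
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Accuracy {
    Approximate,
    Exact,
}

impl CurrentLogicalSize {
    // The long name is deliberate: callers must visibly opt in to ignoring accuracy.
    fn size_dont_care_about_accuracy(&self) -> u64 {
        match self {
            Self::Approximate(Approximate(v)) | Self::Exact(Exact(v)) => *v,
        }
    }

    fn accuracy(&self) -> Accuracy {
        match self {
            Self::Approximate(_) => Accuracy::Approximate,
            Self::Exact(_) => Accuracy::Exact,
        }
    }
}

fn main() {
    let size = CurrentLogicalSize::Approximate(Approximate(42));
    assert_eq!(size.size_dont_care_about_accuracy(), 42);
    assert_eq!(size.accuracy(), Accuracy::Approximate);

    let exact = CurrentLogicalSize::Exact(Exact(1024));
    assert_eq!(exact.accuracy(), Accuracy::Exact);
}
```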
@@ -43,37 +43,52 @@ impl<'t> UninitializedTimeline<'t> {
     /// The caller is responsible for activating the timeline (function `.activate()`).
     pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
         let timeline_id = self.timeline_id;
-        let tenant_id = self.owning_tenant.tenant_id;
+        let tenant_shard_id = self.owning_tenant.tenant_shard_id;

-        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
-            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
-        })?;
+        if self.raw_timeline.is_none() {
+            return Err(anyhow::anyhow!(
+                "No timeline for initialization found for {tenant_shard_id}/{timeline_id}"
+            ));
+        }

         // Check that the caller initialized disk_consistent_lsn
-        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
+        let new_disk_consistent_lsn = self
+            .raw_timeline
+            .as_ref()
+            .expect("checked above")
+            .0
+            .get_disk_consistent_lsn();

         anyhow::ensure!(
             new_disk_consistent_lsn.is_valid(),
-            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
+            "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
         );

         let mut timelines = self.owning_tenant.timelines.lock().unwrap();
         match timelines.entry(timeline_id) {
             Entry::Occupied(_) => anyhow::bail!(
-                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
+                "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map"
             ),
             Entry::Vacant(v) => {
+                // after taking here should be no fallible operations, because the drop guard will not
+                // cleanup after and would block for example the tenant deletion
+                let (new_timeline, uninit_mark) =
+                    self.raw_timeline.take().expect("already checked");
+
+                // this is the mutual exclusion between different retries to create the timeline;
+                // this should be an assertion.
                 uninit_mark.remove_uninit_mark().with_context(|| {
                     format!(
-                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
+                        "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}"
                     )
                 })?;
                 v.insert(Arc::clone(&new_timeline));

                 new_timeline.maybe_spawn_flush_loop();
+
+                Ok(new_timeline)
             }
         }
-
-        Ok(new_timeline)
     }

     /// Prepares timeline data by loading it from the basebackup archive.
@@ -119,7 +134,7 @@ impl<'t> UninitializedTimeline<'t> {
             .with_context(|| {
                 format!(
                     "No raw timeline {}/{} found",
-                    self.owning_tenant.tenant_id, self.timeline_id
+                    self.owning_tenant.tenant_shard_id, self.timeline_id
                 )
             })?
             .0)
@@ -129,7 +144,7 @@ impl<'t> UninitializedTimeline<'t> {
 impl Drop for UninitializedTimeline<'_> {
     fn drop(&mut self) {
         if let Some((_, uninit_mark)) = self.raw_timeline.take() {
-            let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered();
+            let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
             error!("Timeline got dropped without initializing, cleaning its files");
             cleanup_timeline_directory(uninit_mark);
         }
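
Aside: `finish_creation` above now defers `raw_timeline.take()` until every fallible check has passed, so the `Drop` impl can still clean up the on-disk files if an earlier step bails out. A stripped-down sketch of that take-late pattern follows; `Guarded`, `finish`, and the resource string are invented names for the illustration, not pageserver code.

```rust
// `Guarded` stands in for the guarded value: the resource stays in the Option
// until all fallible steps pass, so Drop can clean up on early returns.
struct Guarded {
    resource: Option<String>,
}

impl Guarded {
    fn finish(mut self) -> Result<String, String> {
        if self.resource.is_none() {
            return Err("nothing to finish".into());
        }
        // ...further fallible checks would run here while `resource` is still held...
        let resource = self.resource.take().expect("checked above");
        Ok(resource)
    }
}

impl Drop for Guarded {
    fn drop(&mut self) {
        // Only fires if `finish` bailed out before taking the resource.
        if let Some(leftover) = self.resource.take() {
            eprintln!("cleaning up abandoned resource: {leftover}");
        }
    }
}

fn main() {
    let ok = Guarded { resource: Some("timeline-files".to_string()) };
    assert_eq!(ok.finish().unwrap(), "timeline-files");

    let empty = Guarded { resource: None };
    assert!(empty.finish().is_err());
}
```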
@@ -30,6 +30,7 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
     connection_manager_loop_step, ConnectionManagerState,
 };

+use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::num::NonZeroU64;
 use std::ops::ControlFlow;
@@ -41,7 +42,7 @@ use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-use utils::id::TenantTimelineId;
+use utils::id::TimelineId;

 use self::connection_manager::ConnectionManagerStatus;

@@ -60,7 +61,8 @@ pub struct WalReceiverConf {
 }

 pub struct WalReceiver {
-    timeline: TenantTimelineId,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
     manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
 }

@@ -71,7 +73,7 @@ impl WalReceiver {
         mut broker_client: BrokerClientChannel,
         ctx: &RequestContext,
     ) -> Self {
-        let tenant_id = timeline.tenant_id;
+        let tenant_shard_id = timeline.tenant_shard_id;
         let timeline_id = timeline.timeline_id;
         let walreceiver_ctx =
             ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
@@ -81,10 +83,11 @@ impl WalReceiver {
         task_mgr::spawn(
             WALRECEIVER_RUNTIME.handle(),
             TaskKind::WalReceiverManager,
-            Some(tenant_id),
+            Some(timeline.tenant_shard_id),
             Some(timeline_id),
-            &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
+            &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
             false,
+            timeline.cancel.child_token(),
             async move {
                 debug_assert_current_span_has_tenant_and_timeline_id();
                 debug!("WAL receiver manager started, connecting to broker");
@@ -117,11 +120,12 @@ impl WalReceiver {
                 *loop_status.write().unwrap() = None;
                 Ok(())
             }
-            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
         );

         Self {
-            timeline: TenantTimelineId::new(tenant_id, timeline_id),
+            tenant_shard_id,
+            timeline_id,
             manager_status,
         }
     }
@@ -129,8 +133,8 @@ impl WalReceiver {
     pub async fn stop(self) {
         task_mgr::shutdown_tasks(
             Some(TaskKind::WalReceiverManager),
-            Some(self.timeline.tenant_id),
-            Some(self.timeline.timeline_id),
+            Some(self.tenant_shard_id),
+            Some(self.timeline_id),
         )
         .await;
     }
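
Aside: the extra `timeline.cancel.child_token()` argument to `task_mgr::spawn` wires the walreceiver task into the timeline's cancellation tree. The standalone demo below shows the propagation behavior of `tokio_util`'s child tokens that this relies on (assuming the `tokio` and `tokio-util` crates are available); the task body and message are placeholders, not the actual walreceiver logic.

```rust
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    let timeline_cancel = CancellationToken::new(); // parent: the timeline's token in the real code
    let task_token = timeline_cancel.child_token(); // what gets handed to the spawned task

    let task = tokio::spawn(async move {
        // The spawned task only ever watches its own child token.
        task_token.cancelled().await;
        "walreceiver manager observed shutdown"
    });

    // Cancelling the parent propagates to all child tokens and unblocks the task.
    timeline_cancel.cancel();
    assert_eq!(task.await.unwrap(), "walreceiver manager observed shutdown");
}
```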
@@ -75,7 +75,7 @@ pub(super) async fn connection_manager_loop_step(
     }

     let id = TenantTimelineId {
-        tenant_id: connection_manager_state.timeline.tenant_id,
+        tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
         timeline_id: connection_manager_state.timeline.timeline_id,
     };

@@ -388,7 +388,7 @@ struct BrokerSkTimeline {
 impl ConnectionManagerState {
     pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
         let id = TenantTimelineId {
-            tenant_id: timeline.tenant_id,
+            tenant_id: timeline.tenant_shard_id.tenant_id,
             timeline_id: timeline.timeline_id,
         };
         Self {
@@ -163,10 +163,11 @@ pub(super) async fn handle_walreceiver_connection(
     task_mgr::spawn(
         WALRECEIVER_RUNTIME.handle(),
         TaskKind::WalReceiverConnectionPoller,
-        Some(timeline.tenant_id),
+        Some(timeline.tenant_shard_id),
         Some(timeline.timeline_id),
         "walreceiver connection",
         false,
+        cancellation.clone(),
         async move {
             debug_assert_current_span_has_tenant_and_timeline_id();

@@ -396,11 +397,15 @@ pub(super) async fn handle_walreceiver_connection(

     // Send the replication feedback message.
     // Regular standby_status_update fields are put into this message.
-    let (timeline_logical_size, _) = timeline
-        .get_current_logical_size(&ctx)
-        .context("Status update creation failed to get current logical size")?;
+    let current_timeline_size = timeline
+        .get_current_logical_size(
+            crate::tenant::timeline::GetLogicalSizePriority::User,
+            &ctx,
+        )
+        // FIXME: https://github.com/neondatabase/neon/issues/5963
+        .size_dont_care_about_accuracy();
     let status_update = PageserverFeedback {
-        current_timeline_size: timeline_logical_size,
+        current_timeline_size,
         last_received_lsn,
         disk_consistent_lsn,
         remote_consistent_lsn,
Some files were not shown because too many files have changed in this diff.