mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-20 05:40:38 +00:00
Compare commits
119 Commits
jcsp/ha-te
...
proxy-cpla
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
136ed19387 | ||
|
|
cdf12ed008 | ||
|
|
12512f3173 | ||
|
|
b3b7ce457c | ||
|
|
6814bb4b59 | ||
|
|
b3bb1d1cad | ||
|
|
47d2b3a483 | ||
|
|
8dfe3a070c | ||
|
|
3426619a79 | ||
|
|
de03742ca3 | ||
|
|
ad072de420 | ||
|
|
6c18109734 | ||
|
|
5dee58f492 | ||
|
|
6313f1fa7a | ||
|
|
f72415e1fd | ||
|
|
d837ce0686 | ||
|
|
2713142308 | ||
|
|
a6c1fdcaf6 | ||
|
|
adb0526262 | ||
|
|
0099dfa56b | ||
|
|
3a4ebfb95d | ||
|
|
3220f830b7 | ||
|
|
72103d481d | ||
|
|
643683f41a | ||
|
|
35f4c04c9b | ||
|
|
1787cf19e3 | ||
|
|
2668a1dfab | ||
|
|
77f3a30440 | ||
|
|
62b318c928 | ||
|
|
6770ddba2e | ||
|
|
3ee34a3f26 | ||
|
|
fb60278e02 | ||
|
|
d5304337cf | ||
|
|
06cb582d91 | ||
|
|
bb47d536fb | ||
|
|
59cdee749e | ||
|
|
c75b584430 | ||
|
|
5ec6862bcf | ||
|
|
94138c1a28 | ||
|
|
2206e14c26 | ||
|
|
a95c41f463 | ||
|
|
041b653a1a | ||
|
|
55c4ef408b | ||
|
|
5f0d9f2360 | ||
|
|
34fa34d15c | ||
|
|
e961e0d3df | ||
|
|
2726b1934e | ||
|
|
3d16cda846 | ||
|
|
fb66a3dd85 | ||
|
|
6d996427b1 | ||
|
|
4ba3f3518e | ||
|
|
a5d5c2a6a0 | ||
|
|
64c6dfd3e4 | ||
|
|
a8384a074e | ||
|
|
b80704cd34 | ||
|
|
49be446d95 | ||
|
|
ad5efb49ee | ||
|
|
2bc2fd9cfd | ||
|
|
877fd14401 | ||
|
|
db749914d8 | ||
|
|
1d3ae57f18 | ||
|
|
30a3d80d2f | ||
|
|
5cec5cb3cf | ||
|
|
0694ee9531 | ||
|
|
9752ad8489 | ||
|
|
ad6f538aef | ||
|
|
1aa159acca | ||
|
|
60f30000ef | ||
|
|
bc1efa827f | ||
|
|
67522ce83d | ||
|
|
7d32af5ad5 | ||
|
|
59b6cce418 | ||
|
|
bf187aa13f | ||
|
|
22c26d610b | ||
|
|
516f793ab4 | ||
|
|
6443dbef90 | ||
|
|
23416cc358 | ||
|
|
46098ea0ea | ||
|
|
49bc734e02 | ||
|
|
76c44dc140 | ||
|
|
58ef78cf41 | ||
|
|
678ed39de2 | ||
|
|
3d8830ac35 | ||
|
|
38767ace68 | ||
|
|
9fe0193e51 | ||
|
|
8075f0965a | ||
|
|
44f42627dd | ||
|
|
3bd6551b36 | ||
|
|
69338e53e3 | ||
|
|
5309711691 | ||
|
|
8a53d576e6 | ||
|
|
b0aff04157 | ||
|
|
0554bee022 | ||
|
|
83855a907c | ||
|
|
1b41db8bdd | ||
|
|
bac06ea1ac | ||
|
|
7ae8364b0b | ||
|
|
1f7d54f987 | ||
|
|
580e136b2e | ||
|
|
09699d4bd8 | ||
|
|
89cf714890 | ||
|
|
621ea2ec44 | ||
|
|
74d09b78c7 | ||
|
|
0cf0731d8b | ||
|
|
98723844ee | ||
|
|
73a8c97ac8 | ||
|
|
17a3c9036e | ||
|
|
8c5b310090 | ||
|
|
8224580f3e | ||
|
|
2b0f3549f7 | ||
|
|
b4972d07d4 | ||
|
|
26ae7b0b3e | ||
|
|
f8483cc4a3 | ||
|
|
cc5d6c66b3 | ||
|
|
d894d2b450 | ||
|
|
b09d686335 | ||
|
|
74d24582cf | ||
|
|
4834d22d2d | ||
|
|
86e8c43ddf |
26
.github/workflows/build_and_test.yml
vendored
26
.github/workflows/build_and_test.yml
vendored
@@ -461,6 +461,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Pytest regression tests
|
- name: Pytest regression tests
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
|
timeout-minutes: 60
|
||||||
with:
|
with:
|
||||||
build_type: ${{ matrix.build_type }}
|
build_type: ${{ matrix.build_type }}
|
||||||
test_selection: regress
|
test_selection: regress
|
||||||
@@ -474,7 +475,7 @@ jobs:
|
|||||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||||
|
|
||||||
# Temporary disable this step until we figure out why it's so flaky
|
# Temporary disable this step until we figure out why it's so flaky
|
||||||
@@ -554,7 +555,7 @@ jobs:
|
|||||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||||
# while coverage is currently collected for the debug ones
|
# while coverage is currently collected for the debug ones
|
||||||
|
|
||||||
@@ -1120,10 +1121,16 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||||
|
|
||||||
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
|
|
||||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
|
|
||||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||||
|
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||||
|
-f deployPgSniRouter=false \
|
||||||
|
-f deployProxy=false \
|
||||||
|
-f deployStorage=true \
|
||||||
|
-f deployStorageBroker=true \
|
||||||
|
-f branch=main \
|
||||||
|
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||||
|
-f deployPreprodRegion=true
|
||||||
|
|
||||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||||
-f deployPgSniRouter=false \
|
-f deployPgSniRouter=false \
|
||||||
-f deployProxy=false \
|
-f deployProxy=false \
|
||||||
@@ -1132,6 +1139,15 @@ jobs:
|
|||||||
-f branch=main \
|
-f branch=main \
|
||||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||||
|
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||||
|
-f deployPgSniRouter=true \
|
||||||
|
-f deployProxy=true \
|
||||||
|
-f deployStorage=false \
|
||||||
|
-f deployStorageBroker=false \
|
||||||
|
-f branch=main \
|
||||||
|
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||||
|
-f deployPreprodRegion=true
|
||||||
|
|
||||||
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
|
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
|
||||||
-f deployPgSniRouter=true \
|
-f deployPgSniRouter=true \
|
||||||
-f deployProxy=true \
|
-f deployProxy=true \
|
||||||
|
|||||||
@@ -1,12 +1,13 @@
|
|||||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||||
/control_plane/attachment_service @neondatabase/storage
|
/control_plane/attachment_service @neondatabase/storage
|
||||||
/libs/pageserver_api/ @neondatabase/storage
|
/libs/pageserver_api/ @neondatabase/storage
|
||||||
/libs/postgres_ffi/ @neondatabase/compute
|
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
|
||||||
/libs/remote_storage/ @neondatabase/storage
|
/libs/remote_storage/ @neondatabase/storage
|
||||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||||
/pageserver/ @neondatabase/storage
|
/pageserver/ @neondatabase/storage
|
||||||
/pgxn/ @neondatabase/compute
|
/pgxn/ @neondatabase/compute
|
||||||
|
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
|
||||||
/proxy/ @neondatabase/proxy
|
/proxy/ @neondatabase/proxy
|
||||||
/safekeeper/ @neondatabase/safekeepers
|
/safekeeper/ @neondatabase/safekeepers
|
||||||
/vendor/ @neondatabase/compute
|
/vendor/ @neondatabase/compute
|
||||||
|
|||||||
212
Cargo.lock
generated
212
Cargo.lock
generated
@@ -276,7 +276,7 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"aws-config",
|
"aws-config",
|
||||||
"aws-sdk-secretsmanager",
|
"bytes",
|
||||||
"camino",
|
"camino",
|
||||||
"clap",
|
"clap",
|
||||||
"control_plane",
|
"control_plane",
|
||||||
@@ -288,6 +288,8 @@ dependencies = [
|
|||||||
"hex",
|
"hex",
|
||||||
"humantime",
|
"humantime",
|
||||||
"hyper",
|
"hyper",
|
||||||
|
"lasso",
|
||||||
|
"measured",
|
||||||
"metrics",
|
"metrics",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"pageserver_api",
|
"pageserver_api",
|
||||||
@@ -295,6 +297,7 @@ dependencies = [
|
|||||||
"postgres_connection",
|
"postgres_connection",
|
||||||
"r2d2",
|
"r2d2",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
"routerify",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
@@ -343,9 +346,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-credential-types"
|
name = "aws-credential-types"
|
||||||
version = "1.1.4"
|
version = "1.1.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7"
|
checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aws-smithy-async",
|
"aws-smithy-async",
|
||||||
"aws-smithy-runtime-api",
|
"aws-smithy-runtime-api",
|
||||||
@@ -355,9 +358,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-runtime"
|
name = "aws-runtime"
|
||||||
version = "1.1.4"
|
version = "1.1.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa"
|
checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aws-credential-types",
|
"aws-credential-types",
|
||||||
"aws-sigv4",
|
"aws-sigv4",
|
||||||
@@ -377,6 +380,29 @@ dependencies = [
|
|||||||
"uuid",
|
"uuid",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aws-sdk-iam"
|
||||||
|
version = "1.17.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b"
|
||||||
|
dependencies = [
|
||||||
|
"aws-credential-types",
|
||||||
|
"aws-runtime",
|
||||||
|
"aws-smithy-async",
|
||||||
|
"aws-smithy-http",
|
||||||
|
"aws-smithy-json",
|
||||||
|
"aws-smithy-query",
|
||||||
|
"aws-smithy-runtime",
|
||||||
|
"aws-smithy-runtime-api",
|
||||||
|
"aws-smithy-types",
|
||||||
|
"aws-smithy-xml",
|
||||||
|
"aws-types",
|
||||||
|
"http 0.2.9",
|
||||||
|
"once_cell",
|
||||||
|
"regex-lite",
|
||||||
|
"tracing",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-sdk-s3"
|
name = "aws-sdk-s3"
|
||||||
version = "1.14.0"
|
version = "1.14.0"
|
||||||
@@ -406,29 +432,6 @@ dependencies = [
|
|||||||
"url",
|
"url",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "aws-sdk-secretsmanager"
|
|
||||||
version = "1.14.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3"
|
|
||||||
dependencies = [
|
|
||||||
"aws-credential-types",
|
|
||||||
"aws-runtime",
|
|
||||||
"aws-smithy-async",
|
|
||||||
"aws-smithy-http",
|
|
||||||
"aws-smithy-json",
|
|
||||||
"aws-smithy-runtime",
|
|
||||||
"aws-smithy-runtime-api",
|
|
||||||
"aws-smithy-types",
|
|
||||||
"aws-types",
|
|
||||||
"bytes",
|
|
||||||
"fastrand 2.0.0",
|
|
||||||
"http 0.2.9",
|
|
||||||
"once_cell",
|
|
||||||
"regex-lite",
|
|
||||||
"tracing",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-sdk-sso"
|
name = "aws-sdk-sso"
|
||||||
version = "1.12.0"
|
version = "1.12.0"
|
||||||
@@ -498,9 +501,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-sigv4"
|
name = "aws-sigv4"
|
||||||
version = "1.1.4"
|
version = "1.2.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742"
|
checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aws-credential-types",
|
"aws-credential-types",
|
||||||
"aws-smithy-eventstream",
|
"aws-smithy-eventstream",
|
||||||
@@ -513,7 +516,7 @@ dependencies = [
|
|||||||
"hex",
|
"hex",
|
||||||
"hmac",
|
"hmac",
|
||||||
"http 0.2.9",
|
"http 0.2.9",
|
||||||
"http 1.0.0",
|
"http 1.1.0",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"p256",
|
"p256",
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
@@ -527,9 +530,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-smithy-async"
|
name = "aws-smithy-async"
|
||||||
version = "1.1.4"
|
version = "1.1.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6"
|
checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
@@ -570,9 +573,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-smithy-http"
|
name = "aws-smithy-http"
|
||||||
version = "0.60.4"
|
version = "0.60.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d"
|
checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aws-smithy-eventstream",
|
"aws-smithy-eventstream",
|
||||||
"aws-smithy-runtime-api",
|
"aws-smithy-runtime-api",
|
||||||
@@ -591,18 +594,18 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-smithy-json"
|
name = "aws-smithy-json"
|
||||||
version = "0.60.4"
|
version = "0.60.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e"
|
checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aws-smithy-types",
|
"aws-smithy-types",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-smithy-query"
|
name = "aws-smithy-query"
|
||||||
version = "0.60.4"
|
version = "0.60.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9"
|
checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aws-smithy-types",
|
"aws-smithy-types",
|
||||||
"urlencoding",
|
"urlencoding",
|
||||||
@@ -610,9 +613,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-smithy-runtime"
|
name = "aws-smithy-runtime"
|
||||||
version = "1.1.4"
|
version = "1.1.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea"
|
checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aws-smithy-async",
|
"aws-smithy-async",
|
||||||
"aws-smithy-http",
|
"aws-smithy-http",
|
||||||
@@ -635,14 +638,15 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-smithy-runtime-api"
|
name = "aws-smithy-runtime-api"
|
||||||
version = "1.1.4"
|
version = "1.2.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29"
|
checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aws-smithy-async",
|
"aws-smithy-async",
|
||||||
"aws-smithy-types",
|
"aws-smithy-types",
|
||||||
"bytes",
|
"bytes",
|
||||||
"http 0.2.9",
|
"http 0.2.9",
|
||||||
|
"http 1.1.0",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
@@ -651,9 +655,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-smithy-types"
|
name = "aws-smithy-types"
|
||||||
version = "1.1.4"
|
version = "1.1.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3"
|
checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64-simd",
|
"base64-simd",
|
||||||
"bytes",
|
"bytes",
|
||||||
@@ -674,18 +678,18 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-smithy-xml"
|
name = "aws-smithy-xml"
|
||||||
version = "0.60.4"
|
version = "0.60.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218"
|
checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"xmlparser",
|
"xmlparser",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aws-types"
|
name = "aws-types"
|
||||||
version = "1.1.4"
|
version = "1.1.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4"
|
checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aws-credential-types",
|
"aws-credential-types",
|
||||||
"aws-smithy-async",
|
"aws-smithy-async",
|
||||||
@@ -1346,6 +1350,7 @@ dependencies = [
|
|||||||
"futures",
|
"futures",
|
||||||
"git-version",
|
"git-version",
|
||||||
"hex",
|
"hex",
|
||||||
|
"humantime",
|
||||||
"hyper",
|
"hyper",
|
||||||
"nix 0.27.1",
|
"nix 0.27.1",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
@@ -2391,9 +2396,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "http"
|
name = "http"
|
||||||
version = "1.0.0"
|
version = "1.1.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea"
|
checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"fnv",
|
"fnv",
|
||||||
@@ -2493,7 +2498,7 @@ dependencies = [
|
|||||||
"hyper",
|
"hyper",
|
||||||
"log",
|
"log",
|
||||||
"rustls 0.21.9",
|
"rustls 0.21.9",
|
||||||
"rustls-native-certs",
|
"rustls-native-certs 0.6.2",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-rustls 0.24.0",
|
"tokio-rustls 0.24.0",
|
||||||
]
|
]
|
||||||
@@ -2879,6 +2884,35 @@ version = "0.7.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "measured"
|
||||||
|
version = "0.0.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
|
||||||
|
dependencies = [
|
||||||
|
"bytes",
|
||||||
|
"hashbrown 0.14.0",
|
||||||
|
"itoa",
|
||||||
|
"lasso",
|
||||||
|
"measured-derive",
|
||||||
|
"memchr",
|
||||||
|
"parking_lot 0.12.1",
|
||||||
|
"rustc-hash",
|
||||||
|
"ryu",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "measured-derive"
|
||||||
|
version = "0.0.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
|
||||||
|
dependencies = [
|
||||||
|
"heck",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.52",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memchr"
|
name = "memchr"
|
||||||
version = "2.6.4"
|
version = "2.6.4"
|
||||||
@@ -3529,6 +3563,7 @@ dependencies = [
|
|||||||
"postgres_connection",
|
"postgres_connection",
|
||||||
"postgres_ffi",
|
"postgres_ffi",
|
||||||
"pq_proto",
|
"pq_proto",
|
||||||
|
"procfs",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"regex",
|
"regex",
|
||||||
"remote_storage",
|
"remote_storage",
|
||||||
@@ -3546,6 +3581,7 @@ dependencies = [
|
|||||||
"strum_macros",
|
"strum_macros",
|
||||||
"svg_fmt",
|
"svg_fmt",
|
||||||
"sync_wrapper",
|
"sync_wrapper",
|
||||||
|
"sysinfo",
|
||||||
"tenant_size_model",
|
"tenant_size_model",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
@@ -3899,7 +3935,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "postgres"
|
name = "postgres"
|
||||||
version = "0.19.4"
|
version = "0.19.4"
|
||||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"fallible-iterator",
|
"fallible-iterator",
|
||||||
@@ -3912,7 +3948,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "postgres-native-tls"
|
name = "postgres-native-tls"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"native-tls",
|
"native-tls",
|
||||||
"tokio",
|
"tokio",
|
||||||
@@ -3923,7 +3959,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "postgres-protocol"
|
name = "postgres-protocol"
|
||||||
version = "0.6.4"
|
version = "0.6.4"
|
||||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64 0.20.0",
|
"base64 0.20.0",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@@ -3936,12 +3972,13 @@ dependencies = [
|
|||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"sha2",
|
"sha2",
|
||||||
"stringprep",
|
"stringprep",
|
||||||
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "postgres-types"
|
name = "postgres-types"
|
||||||
version = "0.2.4"
|
version = "0.2.4"
|
||||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"fallible-iterator",
|
"fallible-iterator",
|
||||||
@@ -4163,6 +4200,10 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
|
"aws-config",
|
||||||
|
"aws-sdk-iam",
|
||||||
|
"aws-sigv4",
|
||||||
|
"aws-types",
|
||||||
"base64 0.13.1",
|
"base64 0.13.1",
|
||||||
"bstr",
|
"bstr",
|
||||||
"bytes",
|
"bytes",
|
||||||
@@ -4173,6 +4214,7 @@ dependencies = [
|
|||||||
"consumption_metrics",
|
"consumption_metrics",
|
||||||
"dashmap",
|
"dashmap",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
|
"fallible-iterator",
|
||||||
"futures",
|
"futures",
|
||||||
"git-version",
|
"git-version",
|
||||||
"hashbrown 0.13.2",
|
"hashbrown 0.13.2",
|
||||||
@@ -4180,6 +4222,7 @@ dependencies = [
|
|||||||
"hex",
|
"hex",
|
||||||
"hmac",
|
"hmac",
|
||||||
"hostname",
|
"hostname",
|
||||||
|
"http 1.1.0",
|
||||||
"humantime",
|
"humantime",
|
||||||
"hyper",
|
"hyper",
|
||||||
"hyper-tungstenite",
|
"hyper-tungstenite",
|
||||||
@@ -4223,6 +4266,7 @@ dependencies = [
|
|||||||
"smallvec",
|
"smallvec",
|
||||||
"smol_str",
|
"smol_str",
|
||||||
"socket2 0.5.5",
|
"socket2 0.5.5",
|
||||||
|
"subtle",
|
||||||
"sync_wrapper",
|
"sync_wrapper",
|
||||||
"task-local-extensions",
|
"task-local-extensions",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
@@ -4394,9 +4438,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "redis"
|
name = "redis"
|
||||||
version = "0.24.0"
|
version = "0.25.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
|
checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
@@ -4405,15 +4449,15 @@ dependencies = [
|
|||||||
"itoa",
|
"itoa",
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"rustls 0.21.9",
|
"rustls 0.22.2",
|
||||||
"rustls-native-certs",
|
"rustls-native-certs 0.7.0",
|
||||||
"rustls-pemfile 1.0.2",
|
"rustls-pemfile 2.1.1",
|
||||||
"rustls-webpki 0.101.7",
|
"rustls-pki-types",
|
||||||
"ryu",
|
"ryu",
|
||||||
"sha1_smol",
|
"sha1_smol",
|
||||||
"socket2 0.4.9",
|
"socket2 0.5.5",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-rustls 0.24.0",
|
"tokio-rustls 0.25.0",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"url",
|
"url",
|
||||||
]
|
]
|
||||||
@@ -4842,6 +4886,19 @@ dependencies = [
|
|||||||
"security-framework",
|
"security-framework",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustls-native-certs"
|
||||||
|
version = "0.7.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792"
|
||||||
|
dependencies = [
|
||||||
|
"openssl-probe",
|
||||||
|
"rustls-pemfile 2.1.1",
|
||||||
|
"rustls-pki-types",
|
||||||
|
"schannel",
|
||||||
|
"security-framework",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustls-pemfile"
|
name = "rustls-pemfile"
|
||||||
version = "1.0.2"
|
version = "1.0.2"
|
||||||
@@ -5344,13 +5401,23 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sha2"
|
name = "sha2"
|
||||||
version = "0.10.6"
|
version = "0.10.8"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
|
checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"cpufeatures",
|
"cpufeatures",
|
||||||
"digest",
|
"digest",
|
||||||
|
"sha2-asm",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sha2-asm"
|
||||||
|
version = "0.6.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -5886,7 +5953,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio-epoll-uring"
|
name = "tokio-epoll-uring"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
|
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"nix 0.26.4",
|
"nix 0.26.4",
|
||||||
@@ -5933,7 +6000,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio-postgres"
|
name = "tokio-postgres"
|
||||||
version = "0.7.7"
|
version = "0.7.7"
|
||||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@@ -6099,7 +6166,7 @@ dependencies = [
|
|||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
"pin-project",
|
"pin-project",
|
||||||
"prost",
|
"prost",
|
||||||
"rustls-native-certs",
|
"rustls-native-certs 0.6.2",
|
||||||
"rustls-pemfile 1.0.2",
|
"rustls-pemfile 1.0.2",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-rustls 0.24.0",
|
"tokio-rustls 0.24.0",
|
||||||
@@ -6423,7 +6490,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "uring-common"
|
name = "uring-common"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
|
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"io-uring",
|
"io-uring",
|
||||||
@@ -6466,6 +6533,7 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"arc-swap",
|
"arc-swap",
|
||||||
|
"async-compression",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bincode",
|
"bincode",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@@ -6504,12 +6572,14 @@ dependencies = [
|
|||||||
"thiserror",
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
|
"tokio-tar",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-error",
|
"tracing-error",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"url",
|
"url",
|
||||||
"uuid",
|
"uuid",
|
||||||
|
"walkdir",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -6981,7 +7051,6 @@ dependencies = [
|
|||||||
"aws-sigv4",
|
"aws-sigv4",
|
||||||
"aws-smithy-async",
|
"aws-smithy-async",
|
||||||
"aws-smithy-http",
|
"aws-smithy-http",
|
||||||
"aws-smithy-runtime-api",
|
|
||||||
"aws-smithy-types",
|
"aws-smithy-types",
|
||||||
"axum",
|
"axum",
|
||||||
"base64 0.21.1",
|
"base64 0.21.1",
|
||||||
@@ -7027,6 +7096,7 @@ dependencies = [
|
|||||||
"scopeguard",
|
"scopeguard",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"sha2",
|
||||||
"smallvec",
|
"smallvec",
|
||||||
"subtle",
|
"subtle",
|
||||||
"syn 1.0.109",
|
"syn 1.0.109",
|
||||||
|
|||||||
10
Cargo.toml
10
Cargo.toml
@@ -52,10 +52,12 @@ async-stream = "0.3"
|
|||||||
async-trait = "0.1"
|
async-trait = "0.1"
|
||||||
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
|
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
|
||||||
aws-sdk-s3 = "1.14"
|
aws-sdk-s3 = "1.14"
|
||||||
aws-sdk-secretsmanager = { version = "1.14.0" }
|
aws-sdk-iam = "1.15.0"
|
||||||
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
|
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
|
||||||
aws-smithy-types = "1.1.4"
|
aws-smithy-types = "1.1.4"
|
||||||
aws-credential-types = "1.1.4"
|
aws-credential-types = "1.1.4"
|
||||||
|
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
|
||||||
|
aws-types = "1.1.7"
|
||||||
axum = { version = "0.6.20", features = ["ws"] }
|
axum = { version = "0.6.20", features = ["ws"] }
|
||||||
base64 = "0.13.0"
|
base64 = "0.13.0"
|
||||||
bincode = "1.3"
|
bincode = "1.3"
|
||||||
@@ -76,6 +78,7 @@ either = "1.8"
|
|||||||
enum-map = "2.4.2"
|
enum-map = "2.4.2"
|
||||||
enumset = "1.0.12"
|
enumset = "1.0.12"
|
||||||
fail = "0.5.0"
|
fail = "0.5.0"
|
||||||
|
fallible-iterator = "0.2"
|
||||||
fs2 = "0.4.3"
|
fs2 = "0.4.3"
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
futures-core = "0.3"
|
futures-core = "0.3"
|
||||||
@@ -88,6 +91,7 @@ hex = "0.4"
|
|||||||
hex-literal = "0.4"
|
hex-literal = "0.4"
|
||||||
hmac = "0.12.1"
|
hmac = "0.12.1"
|
||||||
hostname = "0.3.1"
|
hostname = "0.3.1"
|
||||||
|
http = {version = "1.1.0", features = ["std"]}
|
||||||
http-types = { version = "2", default-features = false }
|
http-types = { version = "2", default-features = false }
|
||||||
humantime = "2.1"
|
humantime = "2.1"
|
||||||
humantime-serde = "1.1.1"
|
humantime-serde = "1.1.1"
|
||||||
@@ -101,6 +105,7 @@ lasso = "0.7"
|
|||||||
leaky-bucket = "1.0.1"
|
leaky-bucket = "1.0.1"
|
||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
|
measured = { version = "0.0.13", features=["default", "lasso"] }
|
||||||
memoffset = "0.8"
|
memoffset = "0.8"
|
||||||
native-tls = "0.2"
|
native-tls = "0.2"
|
||||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||||
@@ -120,7 +125,7 @@ procfs = "0.14"
|
|||||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||||
prost = "0.11"
|
prost = "0.11"
|
||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
|
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||||
regex = "1.10.2"
|
regex = "1.10.2"
|
||||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||||
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
|
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
|
||||||
@@ -148,6 +153,7 @@ smol_str = { version = "0.2.0", features = ["serde"] }
|
|||||||
socket2 = "0.5"
|
socket2 = "0.5"
|
||||||
strum = "0.24"
|
strum = "0.24"
|
||||||
strum_macros = "0.24"
|
strum_macros = "0.24"
|
||||||
|
"subtle" = "2.5.0"
|
||||||
svg_fmt = "0.4.1"
|
svg_fmt = "0.4.1"
|
||||||
sync_wrapper = "0.1.2"
|
sync_wrapper = "0.1.2"
|
||||||
tar = "0.4"
|
tar = "0.4"
|
||||||
|
|||||||
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
|
|||||||
|
|
||||||
# Rust
|
# Rust
|
||||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||||
ENV RUSTC_VERSION=1.76.0
|
ENV RUSTC_VERSION=1.77.0
|
||||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||||
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
|||||||
cargo install --git https://github.com/paritytech/cachepot && \
|
cargo install --git https://github.com/paritytech/cachepot && \
|
||||||
cargo install rustfilt && \
|
cargo install rustfilt && \
|
||||||
cargo install cargo-hakari && \
|
cargo install cargo-hakari && \
|
||||||
cargo install cargo-deny && \
|
cargo install cargo-deny --locked && \
|
||||||
cargo install cargo-hack && \
|
cargo install cargo-hack && \
|
||||||
cargo install cargo-nextest && \
|
cargo install cargo-nextest && \
|
||||||
rm -rf /home/nonroot/.cargo/registry && \
|
rm -rf /home/nonroot/.cargo/registry && \
|
||||||
|
|||||||
2
Makefile
2
Makefile
@@ -51,7 +51,7 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
|
|||||||
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
|
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
|
||||||
# Force cargo not to print progress bar
|
# Force cargo not to print progress bar
|
||||||
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||||
# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
|
# Set PQ_LIB_DIR to make sure `storage_controller` gets linked with bundled libpq (through diesel)
|
||||||
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
|
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|||||||
@@ -238,6 +238,14 @@ If you encounter errors during setting up the initial tenant, it's best to stop
|
|||||||
|
|
||||||
## Running tests
|
## Running tests
|
||||||
|
|
||||||
|
### Rust unit tests
|
||||||
|
|
||||||
|
We are using [`cargo-nextest`](https://nexte.st/) to run the tests in GitHub Workflows.
|
||||||
|
Some crates do not support running plain `cargo test` anymore; prefer `cargo nextest run` instead.
|
||||||
|
You can install `cargo-nextest` with `cargo install cargo-nextest`.
|
||||||
|
|
||||||
|
### Integration tests
|
||||||
|
|
||||||
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
|
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ disallowed-methods = [
|
|||||||
"tokio::task::block_in_place",
|
"tokio::task::block_in_place",
|
||||||
# Allow this for now, to deny it later once we stop using Handle::block_on completely
|
# Allow this for now, to deny it later once we stop using Handle::block_on completely
|
||||||
# "tokio::runtime::Handle::block_on",
|
# "tokio::runtime::Handle::block_on",
|
||||||
|
# use tokio_epoll_uring_ext instead
|
||||||
|
"tokio_epoll_uring::thread_local_system",
|
||||||
]
|
]
|
||||||
|
|
||||||
disallowed-macros = [
|
disallowed-macros = [
|
||||||
|
|||||||
@@ -32,6 +32,29 @@ compute_ctl -D /var/db/postgres/compute \
|
|||||||
-b /usr/local/bin/postgres
|
-b /usr/local/bin/postgres
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## State Diagram
|
||||||
|
|
||||||
|
Computes can be in various states. Below is a diagram that details how a
|
||||||
|
compute moves between states.
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
%% https://mermaid.js.org/syntax/stateDiagram.html
|
||||||
|
stateDiagram-v2
|
||||||
|
[*] --> Empty : Compute spawned
|
||||||
|
Empty --> ConfigurationPending : Waiting for compute spec
|
||||||
|
ConfigurationPending --> Configuration : Received compute spec
|
||||||
|
Configuration --> Failed : Failed to configure the compute
|
||||||
|
Configuration --> Running : Compute has been configured
|
||||||
|
Empty --> Init : Compute spec is immediately available
|
||||||
|
Empty --> TerminationPending : Requested termination
|
||||||
|
Init --> Failed : Failed to start Postgres
|
||||||
|
Init --> Running : Started Postgres
|
||||||
|
Running --> TerminationPending : Requested termination
|
||||||
|
TerminationPending --> Terminated : Terminated compute
|
||||||
|
Failed --> [*] : Compute exited
|
||||||
|
Terminated --> [*] : Compute exited
|
||||||
|
```
|
||||||
|
|
||||||
## Tests
|
## Tests
|
||||||
|
|
||||||
Cargo formatter:
|
Cargo formatter:
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ use chrono::{DateTime, Utc};
|
|||||||
use futures::future::join_all;
|
use futures::future::join_all;
|
||||||
use futures::stream::FuturesUnordered;
|
use futures::stream::FuturesUnordered;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
|
use nix::unistd::Pid;
|
||||||
use postgres::error::SqlState;
|
use postgres::error::SqlState;
|
||||||
use postgres::{Client, NoTls};
|
use postgres::{Client, NoTls};
|
||||||
use tracing::{debug, error, info, instrument, warn};
|
use tracing::{debug, error, info, instrument, warn};
|
||||||
@@ -722,8 +723,12 @@ impl ComputeNode {
|
|||||||
// Stop it when it's ready
|
// Stop it when it's ready
|
||||||
info!("waiting for postgres");
|
info!("waiting for postgres");
|
||||||
wait_for_postgres(&mut pg, Path::new(pgdata))?;
|
wait_for_postgres(&mut pg, Path::new(pgdata))?;
|
||||||
pg.kill()?;
|
// SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL
|
||||||
info!("sent kill signal");
|
// it to avoid orphaned processes prowling around while datadir is
|
||||||
|
// wiped.
|
||||||
|
let pm_pid = Pid::from_raw(pg.id() as i32);
|
||||||
|
kill(pm_pid, Signal::SIGQUIT)?;
|
||||||
|
info!("sent SIGQUIT signal");
|
||||||
pg.wait()?;
|
pg.wait()?;
|
||||||
info!("done prewarming");
|
info!("done prewarming");
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
|||||||
.write(true)
|
.write(true)
|
||||||
.create(true)
|
.create(true)
|
||||||
.append(false)
|
.append(false)
|
||||||
|
.truncate(false)
|
||||||
.open(path)?;
|
.open(path)?;
|
||||||
let buf = io::BufReader::new(&file);
|
let buf = io::BufReader::new(&file);
|
||||||
let mut count: usize = 0;
|
let mut count: usize = 0;
|
||||||
|
|||||||
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
RoleAction::Create => {
|
RoleAction::Create => {
|
||||||
// This branch only runs when roles are created through the console, so it is
|
// This branch only runs when roles are created through the console, so it is
|
||||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||||
// from neon_superuser.
|
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
|
||||||
let mut query: String = format!(
|
let mut query: String = format!(
|
||||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||||
name.pg_quote()
|
name.pg_quote()
|
||||||
);
|
);
|
||||||
info!("running role create query: '{}'", &query);
|
info!("running role create query: '{}'", &query);
|
||||||
@@ -743,19 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
|
|||||||
// which may happen in two cases:
|
// which may happen in two cases:
|
||||||
// - extension was just installed
|
// - extension was just installed
|
||||||
// - extension was already installed and is up to date
|
// - extension was already installed and is up to date
|
||||||
let query = "ALTER EXTENSION neon UPDATE";
|
// DISABLED due to compute node unpinning epic
|
||||||
info!("update neon extension version with query: {}", query);
|
// let query = "ALTER EXTENSION neon UPDATE";
|
||||||
client.simple_query(query)?;
|
// info!("update neon extension version with query: {}", query);
|
||||||
|
// client.simple_query(query)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
|
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
|
||||||
info!("handle neon extension upgrade");
|
info!("handle neon extension upgrade (not really)");
|
||||||
let query = "ALTER EXTENSION neon UPDATE";
|
// DISABLED due to compute node unpinning epic
|
||||||
info!("update neon extension version with query: {}", query);
|
// let query = "ALTER EXTENSION neon UPDATE";
|
||||||
client.simple_query(query)?;
|
// info!("update neon extension version with query: {}", query);
|
||||||
|
// client.simple_query(query)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -805,6 +807,18 @@ $$;"#,
|
|||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
// Add new migrations below.
|
// Add new migrations below.
|
||||||
|
r#"
|
||||||
|
DO $$
|
||||||
|
DECLARE
|
||||||
|
role_name TEXT;
|
||||||
|
BEGIN
|
||||||
|
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
||||||
|
LOOP
|
||||||
|
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
||||||
|
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
||||||
|
END LOOP;
|
||||||
|
END
|
||||||
|
$$;"#,
|
||||||
];
|
];
|
||||||
|
|
||||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ clap.workspace = true
|
|||||||
comfy-table.workspace = true
|
comfy-table.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
git-version.workspace = true
|
git-version.workspace = true
|
||||||
|
humantime.workspace = true
|
||||||
nix.workspace = true
|
nix.workspace = true
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
postgres.workspace = true
|
postgres.workspace = true
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ testing = []
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
aws-config.workspace = true
|
aws-config.workspace = true
|
||||||
aws-sdk-secretsmanager.workspace = true
|
bytes.workspace = true
|
||||||
camino.workspace = true
|
camino.workspace = true
|
||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
fail.workspace = true
|
fail.workspace = true
|
||||||
@@ -25,17 +25,20 @@ git-version.workspace = true
|
|||||||
hex.workspace = true
|
hex.workspace = true
|
||||||
hyper.workspace = true
|
hyper.workspace = true
|
||||||
humantime.workspace = true
|
humantime.workspace = true
|
||||||
|
lasso.workspace = true
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
pageserver_api.workspace = true
|
pageserver_api.workspace = true
|
||||||
pageserver_client.workspace = true
|
pageserver_client.workspace = true
|
||||||
postgres_connection.workspace = true
|
postgres_connection.workspace = true
|
||||||
reqwest.workspace = true
|
reqwest.workspace = true
|
||||||
|
routerify.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
|
measured.workspace = true
|
||||||
|
|
||||||
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
|
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
|
||||||
diesel_migrations = { version = "2.1.0" }
|
diesel_migrations = { version = "2.1.0" }
|
||||||
|
|||||||
@@ -0,0 +1,3 @@
|
|||||||
|
|
||||||
|
UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';
|
||||||
|
UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
|
||||||
|
UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}';
|
||||||
|
UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"';
|
||||||
@@ -1,4 +1,3 @@
|
|||||||
use std::sync::Arc;
|
|
||||||
use std::{collections::HashMap, time::Duration};
|
use std::{collections::HashMap, time::Duration};
|
||||||
|
|
||||||
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
|
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
|
||||||
@@ -24,13 +23,10 @@ struct ShardedComputeHookTenant {
|
|||||||
stripe_size: ShardStripeSize,
|
stripe_size: ShardStripeSize,
|
||||||
shard_count: ShardCount,
|
shard_count: ShardCount,
|
||||||
shards: Vec<(ShardNumber, NodeId)>,
|
shards: Vec<(ShardNumber, NodeId)>,
|
||||||
|
|
||||||
// Async lock used for ensuring that remote compute hook calls are ordered identically to updates to this structure
|
|
||||||
lock: Arc<tokio::sync::Mutex<()>>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ComputeHookTenant {
|
enum ComputeHookTenant {
|
||||||
Unsharded((NodeId, Arc<tokio::sync::Mutex<()>>)),
|
Unsharded(NodeId),
|
||||||
Sharded(ShardedComputeHookTenant),
|
Sharded(ShardedComputeHookTenant),
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -42,17 +38,9 @@ impl ComputeHookTenant {
|
|||||||
shards: vec![(tenant_shard_id.shard_number, node_id)],
|
shards: vec![(tenant_shard_id.shard_number, node_id)],
|
||||||
stripe_size,
|
stripe_size,
|
||||||
shard_count: tenant_shard_id.shard_count,
|
shard_count: tenant_shard_id.shard_count,
|
||||||
lock: Arc::default(),
|
|
||||||
})
|
})
|
||||||
} else {
|
} else {
|
||||||
Self::Unsharded((node_id, Arc::default()))
|
Self::Unsharded(node_id)
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_lock(&self) -> &Arc<tokio::sync::Mutex<()>> {
|
|
||||||
match self {
|
|
||||||
Self::Unsharded((_node_id, lock)) => lock,
|
|
||||||
Self::Sharded(sharded_tenant) => &sharded_tenant.lock,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -65,9 +53,7 @@ impl ComputeHookTenant {
|
|||||||
node_id: NodeId,
|
node_id: NodeId,
|
||||||
) {
|
) {
|
||||||
match self {
|
match self {
|
||||||
Self::Unsharded((existing_node_id, _lock))
|
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
|
||||||
if tenant_shard_id.shard_count.count() == 1 =>
|
|
||||||
{
|
|
||||||
*existing_node_id = node_id
|
*existing_node_id = node_id
|
||||||
}
|
}
|
||||||
Self::Sharded(sharded_tenant)
|
Self::Sharded(sharded_tenant)
|
||||||
@@ -136,15 +122,9 @@ pub(crate) enum NotifyError {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl ComputeHookTenant {
|
impl ComputeHookTenant {
|
||||||
fn maybe_reconfigure(
|
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
|
||||||
&self,
|
match self {
|
||||||
tenant_id: TenantId,
|
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
|
||||||
) -> Option<(
|
|
||||||
ComputeHookNotifyRequest,
|
|
||||||
impl std::future::Future<Output = tokio::sync::OwnedMutexGuard<()>>,
|
|
||||||
)> {
|
|
||||||
let request = match self {
|
|
||||||
Self::Unsharded((node_id, _lock)) => Some(ComputeHookNotifyRequest {
|
|
||||||
tenant_id,
|
tenant_id,
|
||||||
shards: vec![ComputeHookNotifyRequestShard {
|
shards: vec![ComputeHookNotifyRequestShard {
|
||||||
shard_number: ShardNumber(0),
|
shard_number: ShardNumber(0),
|
||||||
@@ -178,9 +158,7 @@ impl ComputeHookTenant {
|
|||||||
);
|
);
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
|
||||||
request.map(|r| (r, self.get_lock().clone().lock_owned()))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -189,11 +167,8 @@ impl ComputeHookTenant {
|
|||||||
/// the compute connection string.
|
/// the compute connection string.
|
||||||
pub(super) struct ComputeHook {
|
pub(super) struct ComputeHook {
|
||||||
config: Config,
|
config: Config,
|
||||||
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
|
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
|
||||||
authorization_header: Option<String>,
|
authorization_header: Option<String>,
|
||||||
|
|
||||||
// This lock is only used in testing enviroments, to serialize calls into neon_lock
|
|
||||||
neon_local_lock: tokio::sync::Mutex<()>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ComputeHook {
|
impl ComputeHook {
|
||||||
@@ -207,7 +182,6 @@ impl ComputeHook {
|
|||||||
state: Default::default(),
|
state: Default::default(),
|
||||||
config,
|
config,
|
||||||
authorization_header,
|
authorization_header,
|
||||||
neon_local_lock: Default::default(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -216,10 +190,6 @@ impl ComputeHook {
|
|||||||
&self,
|
&self,
|
||||||
reconfigure_request: ComputeHookNotifyRequest,
|
reconfigure_request: ComputeHookNotifyRequest,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// neon_local updates are not safe to call concurrently, use a lock to serialize
|
|
||||||
// all calls to this function
|
|
||||||
let _locked = self.neon_local_lock.lock().await;
|
|
||||||
|
|
||||||
let env = match LocalEnv::load_config() {
|
let env = match LocalEnv::load_config() {
|
||||||
Ok(e) => e,
|
Ok(e) => e,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
@@ -370,38 +340,30 @@ impl ComputeHook {
|
|||||||
stripe_size: ShardStripeSize,
|
stripe_size: ShardStripeSize,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<(), NotifyError> {
|
) -> Result<(), NotifyError> {
|
||||||
let reconfigure_request = {
|
let mut locked = self.state.lock().await;
|
||||||
let mut locked = self.state.lock().unwrap();
|
|
||||||
|
|
||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
|
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
|
||||||
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
|
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
|
||||||
tenant_shard_id,
|
tenant_shard_id,
|
||||||
stripe_size,
|
stripe_size,
|
||||||
node_id,
|
node_id,
|
||||||
)),
|
)),
|
||||||
Entry::Occupied(e) => {
|
Entry::Occupied(e) => {
|
||||||
let tenant = e.into_mut();
|
let tenant = e.into_mut();
|
||||||
tenant.update(tenant_shard_id, stripe_size, node_id);
|
tenant.update(tenant_shard_id, stripe_size, node_id);
|
||||||
tenant
|
tenant
|
||||||
}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
tenant.maybe_reconfigure(tenant_shard_id.tenant_id)
|
|
||||||
};
|
};
|
||||||
let Some((reconfigure_request, lock_fut)) = reconfigure_request else {
|
|
||||||
|
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
|
||||||
|
let Some(reconfigure_request) = reconfigure_request else {
|
||||||
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
|
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
|
||||||
// until it does.
|
// until it does.
|
||||||
tracing::info!("Tenant isn't yet ready to emit a notification");
|
tracing::info!("Tenant isn't yet ready to emit a notification");
|
||||||
return Ok(());
|
return Ok(());
|
||||||
};
|
};
|
||||||
|
|
||||||
// Finish acquiring the tenant's async lock: this future was created inside the self.state
|
|
||||||
// lock above, so we are guaranteed to get this lock in the same order as callers took
|
|
||||||
// that lock. This ordering is essential: the cloud control plane must end up with the
|
|
||||||
// same end state for the tenant that we see.
|
|
||||||
let _guard = lock_fut.await;
|
|
||||||
|
|
||||||
if let Some(notify_url) = &self.config.compute_hook_url {
|
if let Some(notify_url) = &self.config.compute_hook_url {
|
||||||
self.do_notify(notify_url, reconfigure_request, cancel)
|
self.do_notify(notify_url, reconfigure_request, cancel)
|
||||||
.await
|
.await
|
||||||
@@ -443,7 +405,6 @@ pub(crate) mod tests {
|
|||||||
tenant_state
|
tenant_state
|
||||||
.maybe_reconfigure(tenant_id)
|
.maybe_reconfigure(tenant_id)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.0
|
|
||||||
.shards
|
.shards
|
||||||
.len(),
|
.len(),
|
||||||
1
|
1
|
||||||
@@ -451,7 +412,6 @@ pub(crate) mod tests {
|
|||||||
assert!(tenant_state
|
assert!(tenant_state
|
||||||
.maybe_reconfigure(tenant_id)
|
.maybe_reconfigure(tenant_id)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.0
|
|
||||||
.stripe_size
|
.stripe_size
|
||||||
.is_none());
|
.is_none());
|
||||||
|
|
||||||
@@ -485,7 +445,6 @@ pub(crate) mod tests {
|
|||||||
tenant_state
|
tenant_state
|
||||||
.maybe_reconfigure(tenant_id)
|
.maybe_reconfigure(tenant_id)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.0
|
|
||||||
.shards
|
.shards
|
||||||
.len(),
|
.len(),
|
||||||
2
|
2
|
||||||
@@ -494,7 +453,6 @@ pub(crate) mod tests {
|
|||||||
tenant_state
|
tenant_state
|
||||||
.maybe_reconfigure(tenant_id)
|
.maybe_reconfigure(tenant_id)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.0
|
|
||||||
.stripe_size,
|
.stripe_size,
|
||||||
Some(ShardStripeSize(32768))
|
Some(ShardStripeSize(32768))
|
||||||
);
|
);
|
||||||
|
|||||||
227
control_plane/attachment_service/src/heartbeater.rs
Normal file
227
control_plane/attachment_service/src/heartbeater.rs
Normal file
@@ -0,0 +1,227 @@
|
|||||||
|
use futures::{stream::FuturesUnordered, StreamExt};
|
||||||
|
use std::{
|
||||||
|
collections::HashMap,
|
||||||
|
sync::Arc,
|
||||||
|
time::{Duration, Instant},
|
||||||
|
};
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
|
|
||||||
|
use pageserver_api::{
|
||||||
|
controller_api::{NodeAvailability, UtilizationScore},
|
||||||
|
models::PageserverUtilization,
|
||||||
|
};
|
||||||
|
|
||||||
|
use thiserror::Error;
|
||||||
|
use utils::id::NodeId;
|
||||||
|
|
||||||
|
use crate::node::Node;
|
||||||
|
|
||||||
|
struct HeartbeaterTask {
|
||||||
|
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
|
||||||
|
cancel: CancellationToken,
|
||||||
|
|
||||||
|
state: HashMap<NodeId, PageserverState>,
|
||||||
|
|
||||||
|
max_unavailable_interval: Duration,
|
||||||
|
jwt_token: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub(crate) enum PageserverState {
|
||||||
|
Available {
|
||||||
|
last_seen_at: Instant,
|
||||||
|
utilization: PageserverUtilization,
|
||||||
|
},
|
||||||
|
Offline,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>);
|
||||||
|
|
||||||
|
#[derive(Debug, Error)]
|
||||||
|
pub(crate) enum HeartbeaterError {
|
||||||
|
#[error("Cancelled")]
|
||||||
|
Cancel,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct HeartbeatRequest {
|
||||||
|
pageservers: Arc<HashMap<NodeId, Node>>,
|
||||||
|
reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) struct Heartbeater {
|
||||||
|
sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Heartbeater {
|
||||||
|
pub(crate) fn new(
|
||||||
|
jwt_token: Option<String>,
|
||||||
|
max_unavailable_interval: Duration,
|
||||||
|
cancel: CancellationToken,
|
||||||
|
) -> Self {
|
||||||
|
let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
|
||||||
|
let mut heartbeater =
|
||||||
|
HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel);
|
||||||
|
tokio::task::spawn(async move { heartbeater.run().await });
|
||||||
|
|
||||||
|
Self { sender }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn heartbeat(
|
||||||
|
&self,
|
||||||
|
pageservers: Arc<HashMap<NodeId, Node>>,
|
||||||
|
) -> Result<AvailablityDeltas, HeartbeaterError> {
|
||||||
|
let (sender, receiver) = tokio::sync::oneshot::channel();
|
||||||
|
self.sender
|
||||||
|
.send(HeartbeatRequest {
|
||||||
|
pageservers,
|
||||||
|
reply: sender,
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
receiver.await.unwrap()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HeartbeaterTask {
|
||||||
|
fn new(
|
||||||
|
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
|
||||||
|
jwt_token: Option<String>,
|
||||||
|
max_unavailable_interval: Duration,
|
||||||
|
cancel: CancellationToken,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
receiver,
|
||||||
|
cancel,
|
||||||
|
state: HashMap::new(),
|
||||||
|
max_unavailable_interval,
|
||||||
|
jwt_token,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn run(&mut self) {
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
request = self.receiver.recv() => {
|
||||||
|
match request {
|
||||||
|
Some(req) => {
|
||||||
|
let res = self.heartbeat(req.pageservers).await;
|
||||||
|
req.reply.send(res).unwrap();
|
||||||
|
},
|
||||||
|
None => { return; }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
_ = self.cancel.cancelled() => return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn heartbeat(
|
||||||
|
&mut self,
|
||||||
|
pageservers: Arc<HashMap<NodeId, Node>>,
|
||||||
|
) -> Result<AvailablityDeltas, HeartbeaterError> {
|
||||||
|
let mut new_state = HashMap::new();
|
||||||
|
|
||||||
|
let mut heartbeat_futs = FuturesUnordered::new();
|
||||||
|
for (node_id, node) in &*pageservers {
|
||||||
|
heartbeat_futs.push({
|
||||||
|
let jwt_token = self.jwt_token.clone();
|
||||||
|
let cancel = self.cancel.clone();
|
||||||
|
|
||||||
|
// Clone the node and mark it as available such that the request
|
||||||
|
// goes through to the pageserver even when the node is marked offline.
|
||||||
|
// This doesn't impact the availability observed by [`crate::service::Service`].
|
||||||
|
let mut node = node.clone();
|
||||||
|
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||||
|
|
||||||
|
async move {
|
||||||
|
let response = node
|
||||||
|
.with_client_retries(
|
||||||
|
|client| async move { client.get_utilization().await },
|
||||||
|
&jwt_token,
|
||||||
|
3,
|
||||||
|
3,
|
||||||
|
Duration::from_secs(1),
|
||||||
|
&cancel,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let response = match response {
|
||||||
|
Some(r) => r,
|
||||||
|
None => {
|
||||||
|
// This indicates cancellation of the request.
|
||||||
|
// We ignore the node in this case.
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let status = if let Ok(utilization) = response {
|
||||||
|
PageserverState::Available {
|
||||||
|
last_seen_at: Instant::now(),
|
||||||
|
utilization,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
PageserverState::Offline
|
||||||
|
};
|
||||||
|
|
||||||
|
Some((*node_id, status))
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let maybe_status = tokio::select! {
|
||||||
|
next = heartbeat_futs.next() => {
|
||||||
|
match next {
|
||||||
|
Some(result) => result,
|
||||||
|
None => { break; }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
_ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); }
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some((node_id, status)) = maybe_status {
|
||||||
|
new_state.insert(node_id, status);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut deltas = Vec::new();
|
||||||
|
let now = Instant::now();
|
||||||
|
for (node_id, ps_state) in new_state {
|
||||||
|
use std::collections::hash_map::Entry::*;
|
||||||
|
let entry = self.state.entry(node_id);
|
||||||
|
|
||||||
|
let mut needs_update = false;
|
||||||
|
match entry {
|
||||||
|
Occupied(ref occ) => match (occ.get(), &ps_state) {
|
||||||
|
(PageserverState::Offline, PageserverState::Offline) => {}
|
||||||
|
(PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
|
||||||
|
if now - *last_seen_at >= self.max_unavailable_interval {
|
||||||
|
deltas.push((node_id, ps_state.clone()));
|
||||||
|
needs_update = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
deltas.push((node_id, ps_state.clone()));
|
||||||
|
needs_update = true;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Vacant(_) => {
|
||||||
|
deltas.push((node_id, ps_state.clone()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match entry {
|
||||||
|
Occupied(mut occ) if needs_update => {
|
||||||
|
(*occ.get_mut()) = ps_state;
|
||||||
|
}
|
||||||
|
Vacant(vac) => {
|
||||||
|
vac.insert(ps_state);
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(AvailablityDeltas(deltas))
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,5 +1,11 @@
|
|||||||
|
use crate::metrics::{
|
||||||
|
HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
|
||||||
|
METRICS_REGISTRY,
|
||||||
|
};
|
||||||
use crate::reconciler::ReconcileError;
|
use crate::reconciler::ReconcileError;
|
||||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||||
|
use futures::Future;
|
||||||
|
use hyper::header::CONTENT_TYPE;
|
||||||
use hyper::{Body, Request, Response};
|
use hyper::{Body, Request, Response};
|
||||||
use hyper::{StatusCode, Uri};
|
use hyper::{StatusCode, Uri};
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
@@ -14,7 +20,7 @@ use tokio_util::sync::CancellationToken;
|
|||||||
use utils::auth::{Scope, SwappableJwtAuth};
|
use utils::auth::{Scope, SwappableJwtAuth};
|
||||||
use utils::failpoint_support::failpoints_handler;
|
use utils::failpoint_support::failpoints_handler;
|
||||||
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
|
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
|
||||||
use utils::http::request::{must_get_query_param, parse_request_param};
|
use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
|
||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
|
|
||||||
use utils::{
|
use utils::{
|
||||||
@@ -28,11 +34,13 @@ use utils::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use pageserver_api::controller_api::{
|
use pageserver_api::controller_api::{
|
||||||
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||||
};
|
};
|
||||||
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
||||||
|
|
||||||
use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
|
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
|
||||||
|
|
||||||
|
use routerify::Middleware;
|
||||||
|
|
||||||
/// State available to HTTP request handlers
|
/// State available to HTTP request handlers
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
@@ -176,14 +184,14 @@ async fn handle_tenant_location_config(
|
|||||||
service: Arc<Service>,
|
service: Arc<Service>,
|
||||||
mut req: Request<Body>,
|
mut req: Request<Body>,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||||
check_permissions(&req, Scope::PageServerApi)?;
|
check_permissions(&req, Scope::PageServerApi)?;
|
||||||
|
|
||||||
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
|
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
|
||||||
json_response(
|
json_response(
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
service
|
service
|
||||||
.tenant_location_config(tenant_id, config_req)
|
.tenant_location_config(tenant_shard_id, config_req)
|
||||||
.await?,
|
.await?,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -248,8 +256,10 @@ async fn handle_tenant_secondary_download(
|
|||||||
req: Request<Body>,
|
req: Request<Body>,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||||
service.tenant_secondary_download(tenant_id).await?;
|
let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
|
||||||
json_response(StatusCode::OK, ())
|
|
||||||
|
let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
|
||||||
|
json_response(status, progress)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_tenant_delete(
|
async fn handle_tenant_delete(
|
||||||
@@ -311,7 +321,7 @@ async fn handle_tenant_timeline_passthrough(
|
|||||||
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
|
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
|
||||||
|
|
||||||
// Find the node that holds shard zero
|
// Find the node that holds shard zero
|
||||||
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
|
let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
|
||||||
|
|
||||||
// Callers will always pass an unsharded tenant ID. Before proxying, we must
|
// Callers will always pass an unsharded tenant ID. Before proxying, we must
|
||||||
// rewrite this to a shard-aware shard zero ID.
|
// rewrite this to a shard-aware shard zero ID.
|
||||||
@@ -320,12 +330,39 @@ async fn handle_tenant_timeline_passthrough(
|
|||||||
let tenant_shard_str = format!("{}", tenant_shard_id);
|
let tenant_shard_str = format!("{}", tenant_shard_id);
|
||||||
let path = path.replace(&tenant_str, &tenant_shard_str);
|
let path = path.replace(&tenant_str, &tenant_shard_str);
|
||||||
|
|
||||||
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
|
let latency = &METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_passthrough_request_latency;
|
||||||
|
|
||||||
|
// This is a bit awkward. We remove the param from the request
|
||||||
|
// and join the words by '_' to get a label for the request.
|
||||||
|
let just_path = path.replace(&tenant_shard_str, "");
|
||||||
|
let path_label = just_path
|
||||||
|
.split('/')
|
||||||
|
.filter(|token| !token.is_empty())
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join("_");
|
||||||
|
let labels = PageserverRequestLabelGroup {
|
||||||
|
pageserver_id: &node.get_id().to_string(),
|
||||||
|
path: &path_label,
|
||||||
|
method: crate::metrics::Method::Get,
|
||||||
|
};
|
||||||
|
|
||||||
|
let _timer = latency.start_timer(labels.clone());
|
||||||
|
|
||||||
|
let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
|
||||||
let resp = client.get_raw(path).await.map_err(|_e|
|
let resp = client.get_raw(path).await.map_err(|_e|
|
||||||
// FIXME: give ApiError a proper Unavailable variant. We return 503 here because
|
// FIXME: give ApiError a proper Unavailable variant. We return 503 here because
|
||||||
// if we can't successfully send a request to the pageserver, we aren't available.
|
// if we can't successfully send a request to the pageserver, we aren't available.
|
||||||
ApiError::ShuttingDown)?;
|
ApiError::ShuttingDown)?;
|
||||||
|
|
||||||
|
if !resp.status().is_success() {
|
||||||
|
let error_counter = &METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_passthrough_request_error;
|
||||||
|
error_counter.inc(labels);
|
||||||
|
}
|
||||||
|
|
||||||
// We have a reqwest::Response, would like a http::Response
|
// We have a reqwest::Response, would like a http::Response
|
||||||
let mut builder = hyper::Response::builder()
|
let mut builder = hyper::Response::builder()
|
||||||
.status(resp.status())
|
.status(resp.status())
|
||||||
@@ -351,6 +388,16 @@ async fn handle_tenant_locate(
|
|||||||
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn handle_tenant_describe(
|
||||||
|
service: Arc<Service>,
|
||||||
|
req: Request<Body>,
|
||||||
|
) -> Result<Response<Body>, ApiError> {
|
||||||
|
check_permissions(&req, Scope::Admin)?;
|
||||||
|
|
||||||
|
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||||
|
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
|
||||||
|
}
|
||||||
|
|
||||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
check_permissions(&req, Scope::Admin)?;
|
check_permissions(&req, Scope::Admin)?;
|
||||||
|
|
||||||
@@ -389,7 +436,14 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
|||||||
|
|
||||||
json_response(
|
json_response(
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
state.service.node_configure(config_req).await?,
|
state
|
||||||
|
.service
|
||||||
|
.node_configure(
|
||||||
|
config_req.node_id,
|
||||||
|
config_req.availability.map(NodeAvailability::from),
|
||||||
|
config_req.scheduling,
|
||||||
|
)
|
||||||
|
.await?,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -440,24 +494,6 @@ async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiEr
|
|||||||
state.service.tenants_dump()
|
state.service.tenants_dump()
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_balance_all(
|
|
||||||
service: Arc<Service>,
|
|
||||||
req: Request<Body>,
|
|
||||||
) -> Result<Response<Body>, ApiError> {
|
|
||||||
check_permissions(&req, Scope::Admin)?;
|
|
||||||
service.balance_all()?;
|
|
||||||
json_response(StatusCode::OK, ())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_balance_attached(
|
|
||||||
service: Arc<Service>,
|
|
||||||
req: Request<Body>,
|
|
||||||
) -> Result<Response<Body>, ApiError> {
|
|
||||||
check_permissions(&req, Scope::Admin)?;
|
|
||||||
service.balance_attached()?;
|
|
||||||
json_response(StatusCode::OK, ())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
check_permissions(&req, Scope::Admin)?;
|
check_permissions(&req, Scope::Admin)?;
|
||||||
|
|
||||||
@@ -497,7 +533,11 @@ impl From<ReconcileError> for ApiError {
|
|||||||
|
|
||||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
||||||
/// be allowed to run if Service has finished its initial reconciliation.
|
/// be allowed to run if Service has finished its initial reconciliation.
|
||||||
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
|
async fn tenant_service_handler<R, H>(
|
||||||
|
request: Request<Body>,
|
||||||
|
handler: H,
|
||||||
|
request_name: RequestName,
|
||||||
|
) -> R::Output
|
||||||
where
|
where
|
||||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||||
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
|
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
|
||||||
@@ -517,9 +557,10 @@ where
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
request_span(
|
named_request_span(
|
||||||
request,
|
request,
|
||||||
|request| async move { handler(service, request).await },
|
|request| async move { handler(service, request).await },
|
||||||
|
request_name,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
@@ -530,11 +571,98 @@ fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
struct RequestMeta {
|
||||||
|
method: hyper::http::Method,
|
||||||
|
at: Instant,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||||
|
) -> Middleware<B, ApiError> {
|
||||||
|
Middleware::pre(move |req| async move {
|
||||||
|
let meta = RequestMeta {
|
||||||
|
method: req.method().clone(),
|
||||||
|
at: Instant::now(),
|
||||||
|
};
|
||||||
|
|
||||||
|
req.set_context(meta);
|
||||||
|
|
||||||
|
Ok(req)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||||
|
) -> Middleware<B, ApiError> {
|
||||||
|
Middleware::post_with_info(move |resp, req_info| async move {
|
||||||
|
let request_name = match req_info.context::<RequestName>() {
|
||||||
|
Some(name) => name,
|
||||||
|
None => {
|
||||||
|
return Ok(resp);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(meta) = req_info.context::<RequestMeta>() {
|
||||||
|
let status = &crate::metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_http_request_status;
|
||||||
|
let latency = &crate::metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_http_request_latency;
|
||||||
|
|
||||||
|
status.inc(HttpRequestStatusLabelGroup {
|
||||||
|
path: request_name.0,
|
||||||
|
method: meta.method.clone().into(),
|
||||||
|
status: crate::metrics::StatusCode(resp.status()),
|
||||||
|
});
|
||||||
|
|
||||||
|
latency.observe(
|
||||||
|
HttpRequestLatencyLabelGroup {
|
||||||
|
path: request_name.0,
|
||||||
|
method: meta.method.into(),
|
||||||
|
},
|
||||||
|
meta.at.elapsed().as_secs_f64(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ok(resp)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
|
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
|
||||||
|
|
||||||
|
let payload = crate::metrics::METRICS_REGISTRY.encode();
|
||||||
|
let response = Response::builder()
|
||||||
|
.status(200)
|
||||||
|
.header(CONTENT_TYPE, TEXT_FORMAT)
|
||||||
|
.body(payload.into())
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
Ok(response)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Static name attached to a request's context by `named_request_span`; the
/// metrics epilogue uses it as the `path` label.
#[derive(Clone)]
struct RequestName(&'static str);
|
||||||
|
|
||||||
|
async fn named_request_span<R, H>(
|
||||||
|
request: Request<Body>,
|
||||||
|
handler: H,
|
||||||
|
name: RequestName,
|
||||||
|
) -> R::Output
|
||||||
|
where
|
||||||
|
R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||||
|
H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
|
||||||
|
{
|
||||||
|
request.set_context(name);
|
||||||
|
request_span(request, handler).await
|
||||||
|
}
|
||||||
|
|
||||||
pub fn make_router(
|
pub fn make_router(
|
||||||
service: Arc<Service>,
|
service: Arc<Service>,
|
||||||
auth: Option<Arc<SwappableJwtAuth>>,
|
auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||||
let mut router = endpoint::make_router();
|
let mut router = endpoint::make_router()
|
||||||
|
.middleware(prologue_metrics_middleware())
|
||||||
|
.middleware(epilogue_metrics_middleware());
|
||||||
if auth.is_some() {
|
if auth.is_some() {
|
||||||
router = router.middleware(auth_middleware(|request| {
|
router = router.middleware(auth_middleware(|request| {
|
||||||
let state = get_state(request);
|
let state = get_state(request);
|
||||||
@@ -543,102 +671,166 @@ pub fn make_router(
|
|||||||
} else {
|
} else {
|
||||||
state.auth.as_deref()
|
state.auth.as_deref()
|
||||||
}
|
}
|
||||||
}))
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
router
|
router
|
||||||
.data(Arc::new(HttpState::new(service, auth)))
|
.data(Arc::new(HttpState::new(service, auth)))
|
||||||
|
.get("/metrics", |r| {
|
||||||
|
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
|
||||||
|
})
|
||||||
// Non-prefixed generic endpoints (status, metrics)
|
// Non-prefixed generic endpoints (status, metrics)
|
||||||
.get("/status", |r| request_span(r, handle_status))
|
.get("/status", |r| {
|
||||||
.get("/ready", |r| request_span(r, handle_ready))
|
named_request_span(r, handle_status, RequestName("status"))
|
||||||
|
})
|
||||||
|
.get("/ready", |r| {
|
||||||
|
named_request_span(r, handle_ready, RequestName("ready"))
|
||||||
|
})
|
||||||
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
||||||
.post("/upcall/v1/re-attach", |r| {
|
.post("/upcall/v1/re-attach", |r| {
|
||||||
request_span(r, handle_re_attach)
|
named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
|
||||||
|
})
|
||||||
|
.post("/upcall/v1/validate", |r| {
|
||||||
|
named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
|
||||||
})
|
})
|
||||||
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
|
|
||||||
// Test/dev/debug endpoints
|
// Test/dev/debug endpoints
|
||||||
.post("/debug/v1/attach-hook", |r| {
|
.post("/debug/v1/attach-hook", |r| {
|
||||||
request_span(r, handle_attach_hook)
|
named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
|
||||||
|
})
|
||||||
|
.post("/debug/v1/inspect", |r| {
|
||||||
|
named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
|
||||||
})
|
})
|
||||||
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
|
|
||||||
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
|
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
|
||||||
request_span(r, handle_tenant_drop)
|
named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
|
||||||
})
|
})
|
||||||
.post("/debug/v1/node/:node_id/drop", |r| {
|
.post("/debug/v1/node/:node_id/drop", |r| {
|
||||||
request_span(r, handle_node_drop)
|
named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
|
||||||
|
})
|
||||||
|
.get("/debug/v1/tenant", |r| {
|
||||||
|
named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
|
||||||
|
})
|
||||||
|
.get("/debug/v1/tenant/:tenant_id/locate", |r| {
|
||||||
|
tenant_service_handler(
|
||||||
|
r,
|
||||||
|
handle_tenant_locate,
|
||||||
|
RequestName("debug_v1_tenant_locate"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
|
|
||||||
.get("/debug/v1/scheduler", |r| {
|
.get("/debug/v1/scheduler", |r| {
|
||||||
request_span(r, handle_scheduler_dump)
|
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
|
||||||
})
|
})
|
||||||
.post("/debug/v1/consistency_check", |r| {
|
.post("/debug/v1/consistency_check", |r| {
|
||||||
request_span(r, handle_consistency_check)
|
named_request_span(
|
||||||
|
r,
|
||||||
|
handle_consistency_check,
|
||||||
|
RequestName("debug_v1_consistency_check"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.put("/debug/v1/failpoints", |r| {
|
.put("/debug/v1/failpoints", |r| {
|
||||||
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
|
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
|
||||||
})
|
})
|
||||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
|
||||||
tenant_service_handler(r, handle_tenant_locate)
|
|
||||||
})
|
|
||||||
// Node operations
|
// Node operations
|
||||||
.post("/control/v1/node", |r| {
|
.post("/control/v1/node", |r| {
|
||||||
request_span(r, handle_node_register)
|
named_request_span(r, handle_node_register, RequestName("control_v1_node"))
|
||||||
|
})
|
||||||
|
.get("/control/v1/node", |r| {
|
||||||
|
named_request_span(r, handle_node_list, RequestName("control_v1_node"))
|
||||||
})
|
})
|
||||||
.get("/control/v1/node", |r| request_span(r, handle_node_list))
|
|
||||||
.put("/control/v1/node/:node_id/config", |r| {
|
.put("/control/v1/node/:node_id/config", |r| {
|
||||||
request_span(r, handle_node_configure)
|
named_request_span(
|
||||||
|
r,
|
||||||
|
handle_node_configure,
|
||||||
|
RequestName("control_v1_node_config"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
// Tenant Shard operations
|
// Tenant Shard operations
|
||||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
tenant_service_handler(
|
||||||
|
r,
|
||||||
|
handle_tenant_shard_migrate,
|
||||||
|
RequestName("control_v1_tenant_migrate"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_shard_split)
|
tenant_service_handler(
|
||||||
|
r,
|
||||||
|
handle_tenant_shard_split,
|
||||||
|
RequestName("control_v1_tenant_shard_split"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.post("/control/v1/balance/all", |r| {
|
.get("/control/v1/tenant/:tenant_id", |r| {
|
||||||
tenant_service_handler(r, handle_balance_all)
|
tenant_service_handler(
|
||||||
})
|
r,
|
||||||
.post("/control/v1/balance/attached", |r| {
|
handle_tenant_describe,
|
||||||
tenant_service_handler(r, handle_balance_attached)
|
RequestName("control_v1_tenant_describe"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
// Tenant operations
|
// Tenant operations
|
||||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||||
.post("/v1/tenant", |r| {
|
.post("/v1/tenant", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_create)
|
tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
|
||||||
})
|
})
|
||||||
.delete("/v1/tenant/:tenant_id", |r| {
|
.delete("/v1/tenant/:tenant_id", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_delete)
|
tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/config", |r| {
|
.put("/v1/tenant/config", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_config_set)
|
tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_config_get)
|
tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/:tenant_id/location_config", |r| {
|
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_location_config)
|
tenant_service_handler(
|
||||||
|
r,
|
||||||
|
handle_tenant_location_config,
|
||||||
|
RequestName("v1_tenant_location_config"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
|
tenant_service_handler(
|
||||||
|
r,
|
||||||
|
handle_tenant_time_travel_remote_storage,
|
||||||
|
RequestName("v1_tenant_time_travel_remote_storage"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
|
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_secondary_download)
|
tenant_service_handler(
|
||||||
|
r,
|
||||||
|
handle_tenant_secondary_download,
|
||||||
|
RequestName("v1_tenant_secondary_download"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
// Timeline operations
|
// Timeline operations
|
||||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_timeline_delete)
|
tenant_service_handler(
|
||||||
|
r,
|
||||||
|
handle_tenant_timeline_delete,
|
||||||
|
RequestName("v1_tenant_timeline"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_timeline_create)
|
tenant_service_handler(
|
||||||
|
r,
|
||||||
|
handle_tenant_timeline_create,
|
||||||
|
RequestName("v1_tenant_timeline"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
// Tenant detail GET passthrough to shard zero
|
// Tenant detail GET passthrough to shard zero
|
||||||
.get("/v1/tenant/:tenant_id", |r| {
|
.get("/v1/tenant/:tenant_id", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
tenant_service_handler(
|
||||||
|
r,
|
||||||
|
handle_tenant_timeline_passthrough,
|
||||||
|
RequestName("v1_tenant_passthrough"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
|
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
|
||||||
// timeline GET APIs will be implicitly included.
|
// timeline GET APIs will be implicitly included.
|
||||||
.get("/v1/tenant/:tenant_id/timeline*", |r| {
|
.get("/v1/tenant/:tenant_id/timeline*", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
tenant_service_handler(
|
||||||
|
r,
|
||||||
|
handle_tenant_timeline_passthrough,
|
||||||
|
RequestName("v1_tenant_timeline_passthrough"),
|
||||||
|
)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,10 +3,12 @@ use utils::seqwait::MonotonicCounter;
|
|||||||
|
|
||||||
mod auth;
|
mod auth;
|
||||||
mod compute_hook;
|
mod compute_hook;
|
||||||
|
mod heartbeater;
|
||||||
pub mod http;
|
pub mod http;
|
||||||
mod id_lock_map;
|
mod id_lock_map;
|
||||||
pub mod metrics;
|
pub mod metrics;
|
||||||
mod node;
|
mod node;
|
||||||
|
mod pageserver_client;
|
||||||
pub mod persistence;
|
pub mod persistence;
|
||||||
mod reconciler;
|
mod reconciler;
|
||||||
mod scheduler;
|
mod scheduler;
|
||||||
|
|||||||
@@ -1,15 +1,8 @@
|
|||||||
/// The attachment service mimics the aspects of the control plane API
|
|
||||||
/// that are required for a pageserver to operate.
|
|
||||||
///
|
|
||||||
/// This enables running & testing pageservers without a full-blown
|
|
||||||
/// deployment of the Neon cloud platform.
|
|
||||||
///
|
|
||||||
use anyhow::{anyhow, Context};
|
use anyhow::{anyhow, Context};
|
||||||
use attachment_service::http::make_router;
|
use attachment_service::http::make_router;
|
||||||
use attachment_service::metrics::preinitialize_metrics;
|
use attachment_service::metrics::preinitialize_metrics;
|
||||||
use attachment_service::persistence::Persistence;
|
use attachment_service::persistence::Persistence;
|
||||||
use attachment_service::service::{Config, Service};
|
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
|
||||||
use aws_config::{BehaviorVersion, Region};
|
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use diesel::Connection;
|
use diesel::Connection;
|
||||||
@@ -60,6 +53,30 @@ struct Cli {
|
|||||||
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
|
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
database_url: Option<String>,
|
database_url: Option<String>,
|
||||||
|
|
||||||
|
/// Flag to enable dev mode, which permits running without auth
|
||||||
|
#[arg(long, default_value = "false")]
|
||||||
|
dev: bool,
|
||||||
|
|
||||||
|
/// Grace period before marking unresponsive pageserver offline
|
||||||
|
#[arg(long)]
|
||||||
|
max_unavailable_interval: Option<humantime::Duration>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// How strictly the presence of secrets is enforced.
enum StrictMode {
    /// Every secret must be loaded: security features may not be switched off
    /// implicitly by leaving a secret out of the environment.
    Strict,
    /// Secrets are optional. Omitting one implicitly disables the auth tied to it
    /// (e.g. no pageserver jwt key -> send unauthenticated requests, no public
    /// key -> don't authenticate incoming requests).
    Dev,
}
|
||||||
|
|
||||||
|
impl Default for StrictMode {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::Strict
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
|
/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
|
||||||
@@ -72,13 +89,6 @@ struct Secrets {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Secrets {
|
impl Secrets {
|
||||||
const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
|
|
||||||
const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
|
|
||||||
"neon-storage-controller-pageserver-jwt-token";
|
|
||||||
const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
|
|
||||||
"neon-storage-controller-control-plane-jwt-token";
|
|
||||||
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
|
|
||||||
|
|
||||||
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
|
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
|
||||||
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
|
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
|
||||||
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
|
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
|
||||||
@@ -89,111 +99,41 @@ impl Secrets {
|
|||||||
/// - Environment variables if DATABASE_URL is set.
|
/// - Environment variables if DATABASE_URL is set.
|
||||||
/// - AWS Secrets Manager secrets
|
/// - AWS Secrets Manager secrets
|
||||||
async fn load(args: &Cli) -> anyhow::Result<Self> {
|
async fn load(args: &Cli) -> anyhow::Result<Self> {
|
||||||
match &args.database_url {
|
let Some(database_url) =
|
||||||
Some(url) => Self::load_cli(url, args),
|
Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
|
||||||
None => match std::env::var(Self::DATABASE_URL_ENV) {
|
|
||||||
Ok(database_url) => Self::load_env(database_url),
|
|
||||||
Err(_) => Self::load_aws_sm().await,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn load_env(database_url: String) -> anyhow::Result<Self> {
|
|
||||||
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
|
|
||||||
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
|
|
||||||
Err(_) => None,
|
|
||||||
};
|
|
||||||
Ok(Self {
|
|
||||||
database_url,
|
|
||||||
public_key,
|
|
||||||
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
|
|
||||||
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn load_aws_sm() -> anyhow::Result<Self> {
|
|
||||||
let Ok(region) = std::env::var("AWS_REGION") else {
|
|
||||||
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
|
|
||||||
};
|
|
||||||
let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
|
|
||||||
.region(Region::new(region.clone()))
|
|
||||||
.load()
|
|
||||||
.await;
|
|
||||||
|
|
||||||
let asm = aws_sdk_secretsmanager::Client::new(&config);
|
|
||||||
|
|
||||||
let Some(database_url) = asm
|
|
||||||
.get_secret_value()
|
|
||||||
.secret_id(Self::DATABASE_URL_SECRET)
|
|
||||||
.send()
|
|
||||||
.await?
|
|
||||||
.secret_string()
|
|
||||||
.map(str::to_string)
|
|
||||||
else {
|
else {
|
||||||
anyhow::bail!(
|
anyhow::bail!(
|
||||||
"Database URL secret not found at {region}/{}",
|
"Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
|
||||||
Self::DATABASE_URL_SECRET
|
|
||||||
)
|
)
|
||||||
};
|
};
|
||||||
|
|
||||||
let jwt_token = asm
|
let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
|
||||||
.get_secret_value()
|
Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
|
||||||
.secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
|
None => None,
|
||||||
.send()
|
|
||||||
.await?
|
|
||||||
.secret_string()
|
|
||||||
.map(str::to_string);
|
|
||||||
if jwt_token.is_none() {
|
|
||||||
tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
|
|
||||||
}
|
|
||||||
|
|
||||||
let control_plane_jwt_token = asm
|
|
||||||
.get_secret_value()
|
|
||||||
.secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
|
|
||||||
.send()
|
|
||||||
.await?
|
|
||||||
.secret_string()
|
|
||||||
.map(str::to_string);
|
|
||||||
if jwt_token.is_none() {
|
|
||||||
tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver");
|
|
||||||
}
|
|
||||||
|
|
||||||
let public_key = asm
|
|
||||||
.get_secret_value()
|
|
||||||
.secret_id(Self::PUBLIC_KEY_SECRET)
|
|
||||||
.send()
|
|
||||||
.await?
|
|
||||||
.secret_string()
|
|
||||||
.map(str::to_string);
|
|
||||||
let public_key = match public_key {
|
|
||||||
Some(key) => Some(JwtAuth::from_key(key)?),
|
|
||||||
None => {
|
|
||||||
tracing::warn!(
|
|
||||||
"No public key set: inccoming HTTP requests will not be authenticated"
|
|
||||||
);
|
|
||||||
None
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Self {
|
let this = Self {
|
||||||
database_url,
|
database_url,
|
||||||
public_key,
|
public_key,
|
||||||
jwt_token,
|
jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
|
||||||
control_plane_jwt_token,
|
control_plane_jwt_token: Self::load_secret(
|
||||||
})
|
&args.control_plane_jwt_token,
|
||||||
|
Self::CONTROL_PLANE_JWT_TOKEN_ENV,
|
||||||
|
)
|
||||||
|
.await,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(this)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
|
async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
|
||||||
let public_key = match &args.public_key {
|
if let Some(v) = cli {
|
||||||
None => None,
|
Some(v.clone())
|
||||||
Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
|
} else if let Ok(v) = std::env::var(env_name) {
|
||||||
};
|
Some(v)
|
||||||
Ok(Self {
|
} else {
|
||||||
database_url: database_url.to_owned(),
|
None
|
||||||
public_key,
|
}
|
||||||
jwt_token: args.jwt_token.clone(),
|
|
||||||
control_plane_jwt_token: args.control_plane_jwt_token.clone(),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -212,6 +152,12 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn main() -> anyhow::Result<()> {
|
fn main() -> anyhow::Result<()> {
|
||||||
|
let default_panic = std::panic::take_hook();
|
||||||
|
std::panic::set_hook(Box::new(move |info| {
|
||||||
|
default_panic(info);
|
||||||
|
std::process::exit(1);
|
||||||
|
}));
|
||||||
|
|
||||||
tokio::runtime::Builder::new_current_thread()
|
tokio::runtime::Builder::new_current_thread()
|
||||||
// We use spawn_blocking for database operations, so require approximately
|
// We use spawn_blocking for database operations, so require approximately
|
||||||
// as many blocking threads as we will open database connections.
|
// as many blocking threads as we will open database connections.
|
||||||
@@ -243,12 +189,50 @@ async fn async_main() -> anyhow::Result<()> {
|
|||||||
args.listen
|
args.listen
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let strict_mode = if args.dev {
|
||||||
|
StrictMode::Dev
|
||||||
|
} else {
|
||||||
|
StrictMode::Strict
|
||||||
|
};
|
||||||
|
|
||||||
let secrets = Secrets::load(&args).await?;
|
let secrets = Secrets::load(&args).await?;
|
||||||
|
|
||||||
|
// Validate required secrets and arguments are provided in strict mode
|
||||||
|
match strict_mode {
|
||||||
|
StrictMode::Strict
|
||||||
|
if (secrets.public_key.is_none()
|
||||||
|
|| secrets.jwt_token.is_none()
|
||||||
|
|| secrets.control_plane_jwt_token.is_none()) =>
|
||||||
|
{
|
||||||
|
// Production systems should always have secrets configured: if public_key was not set
|
||||||
|
// then we would implicitly disable auth.
|
||||||
|
anyhow::bail!(
|
||||||
|
"Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
StrictMode::Strict if args.compute_hook_url.is_none() => {
|
||||||
|
// Production systems should always have a compute hook set, to prevent falling
|
||||||
|
// back to trying to use neon_local.
|
||||||
|
anyhow::bail!(
|
||||||
|
"`--compute-hook-url` is not set: this is only permitted in `--dev` mode"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
StrictMode::Strict => {
|
||||||
|
tracing::info!("Starting in strict mode: configuration is OK.")
|
||||||
|
}
|
||||||
|
StrictMode::Dev => {
|
||||||
|
tracing::warn!("Starting in dev mode: this may be an insecure configuration.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let config = Config {
|
let config = Config {
|
||||||
jwt_token: secrets.jwt_token,
|
jwt_token: secrets.jwt_token,
|
||||||
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
||||||
compute_hook_url: args.compute_hook_url,
|
compute_hook_url: args.compute_hook_url,
|
||||||
|
max_unavailable_interval: args
|
||||||
|
.max_unavailable_interval
|
||||||
|
.map(humantime::Duration::into)
|
||||||
|
.unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
|
||||||
};
|
};
|
||||||
|
|
||||||
// After loading secrets & config, but before starting anything else, apply database migrations
|
// After loading secrets & config, but before starting anything else, apply database migrations
|
||||||
|
|||||||
@@ -1,32 +1,284 @@
|
|||||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
//!
|
||||||
|
//! This module provides metric definitions for the storage controller.
|
||||||
|
//!
|
||||||
|
//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
|
||||||
|
//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
|
||||||
|
//! constant.
|
||||||
|
//!
|
||||||
|
//! The rest of the code defines label group types and deals with converting outer types to labels.
|
||||||
|
//!
|
||||||
|
use bytes::Bytes;
|
||||||
|
use measured::{
|
||||||
|
label::{LabelValue, StaticLabelSet},
|
||||||
|
FixedCardinalityLabel, MetricGroup,
|
||||||
|
};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
|
use std::sync::Mutex;
|
||||||
|
|
||||||
pub(crate) struct ReconcilerMetrics {
|
use crate::persistence::{DatabaseError, DatabaseOperation};
|
||||||
pub(crate) spawned: IntCounter,
|
|
||||||
pub(crate) complete: IntCounterVec,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ReconcilerMetrics {
|
pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
|
||||||
// Labels used on [`Self::complete`]
|
Lazy::new(StorageControllerMetrics::default);
|
||||||
pub(crate) const SUCCESS: &'static str = "ok";
|
|
||||||
pub(crate) const ERROR: &'static str = "success";
|
|
||||||
pub(crate) const CANCEL: &'static str = "cancel";
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
|
|
||||||
spawned: register_int_counter!(
|
|
||||||
"storage_controller_reconcile_spawn",
|
|
||||||
"Count of how many times we spawn a reconcile task",
|
|
||||||
)
|
|
||||||
.expect("failed to define a metric"),
|
|
||||||
complete: register_int_counter_vec!(
|
|
||||||
"storage_controller_reconcile_complete",
|
|
||||||
"Reconciler tasks completed, broken down by success/failure/cancelled",
|
|
||||||
&["status"],
|
|
||||||
)
|
|
||||||
.expect("failed to define a metric"),
|
|
||||||
});
|
|
||||||
|
|
||||||
pub fn preinitialize_metrics() {
|
pub fn preinitialize_metrics() {
|
||||||
Lazy::force(&RECONCILER);
|
Lazy::force(&METRICS_REGISTRY);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) struct StorageControllerMetrics {
|
||||||
|
pub(crate) metrics_group: StorageControllerMetricGroup,
|
||||||
|
encoder: Mutex<measured::text::TextEncoder>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(measured::MetricGroup)]
|
||||||
|
pub(crate) struct StorageControllerMetricGroup {
|
||||||
|
/// Count of how many times we spawn a reconcile task
|
||||||
|
pub(crate) storage_controller_reconcile_spawn: measured::Counter,
|
||||||
|
/// Reconciler tasks completed, broken down by success/failure/cancelled
|
||||||
|
pub(crate) storage_controller_reconcile_complete:
|
||||||
|
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
|
||||||
|
|
||||||
|
/// HTTP request status counters for handled requests
|
||||||
|
pub(crate) storage_controller_http_request_status:
|
||||||
|
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
|
||||||
|
/// HTTP request handler latency across all status codes
|
||||||
|
pub(crate) storage_controller_http_request_latency:
|
||||||
|
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
|
||||||
|
|
||||||
|
/// Count of HTTP requests to the pageserver that resulted in an error,
|
||||||
|
/// broken down by the pageserver node id, request name and method
|
||||||
|
pub(crate) storage_controller_pageserver_request_error:
|
||||||
|
measured::CounterVec<PageserverRequestLabelGroupSet>,
|
||||||
|
|
||||||
|
/// Latency of HTTP requests to the pageserver, broken down by pageserver
|
||||||
|
/// node id, request name and method. This includes both successful and unsuccessful
|
||||||
|
/// requests.
|
||||||
|
pub(crate) storage_controller_pageserver_request_latency:
|
||||||
|
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
|
||||||
|
|
||||||
|
/// Count of pass-through HTTP requests to the pageserver that resulted in an error,
|
||||||
|
/// broken down by the pageserver node id, request name and method
|
||||||
|
pub(crate) storage_controller_passthrough_request_error:
|
||||||
|
measured::CounterVec<PageserverRequestLabelGroupSet>,
|
||||||
|
|
||||||
|
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
|
||||||
|
/// node id, request name and method. This includes both successful and unsuccessful
|
||||||
|
/// requests.
|
||||||
|
pub(crate) storage_controller_passthrough_request_latency:
|
||||||
|
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
|
||||||
|
|
||||||
|
/// Count of errors in database queries, broken down by error type and operation.
|
||||||
|
pub(crate) storage_controller_database_query_error:
|
||||||
|
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
|
||||||
|
|
||||||
|
/// Latency of database queries, broken down by operation.
|
||||||
|
pub(crate) storage_controller_database_query_latency:
|
||||||
|
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StorageControllerMetrics {
|
||||||
|
pub(crate) fn encode(&self) -> Bytes {
|
||||||
|
let mut encoder = self.encoder.lock().unwrap();
|
||||||
|
self.metrics_group.collect_into(&mut *encoder);
|
||||||
|
encoder.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for StorageControllerMetrics {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
metrics_group: StorageControllerMetricGroup::new(),
|
||||||
|
encoder: Mutex::new(measured::text::TextEncoder::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StorageControllerMetricGroup {
|
||||||
|
pub(crate) fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
storage_controller_reconcile_spawn: measured::Counter::new(),
|
||||||
|
storage_controller_reconcile_complete: measured::CounterVec::new(
|
||||||
|
ReconcileCompleteLabelGroupSet {
|
||||||
|
status: StaticLabelSet::new(),
|
||||||
|
},
|
||||||
|
),
|
||||||
|
storage_controller_http_request_status: measured::CounterVec::new(
|
||||||
|
HttpRequestStatusLabelGroupSet {
|
||||||
|
path: lasso::ThreadedRodeo::new(),
|
||||||
|
method: StaticLabelSet::new(),
|
||||||
|
status: StaticLabelSet::new(),
|
||||||
|
},
|
||||||
|
),
|
||||||
|
storage_controller_http_request_latency: measured::HistogramVec::new(
|
||||||
|
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||||
|
),
|
||||||
|
storage_controller_pageserver_request_error: measured::CounterVec::new(
|
||||||
|
PageserverRequestLabelGroupSet {
|
||||||
|
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||||
|
path: lasso::ThreadedRodeo::new(),
|
||||||
|
method: StaticLabelSet::new(),
|
||||||
|
},
|
||||||
|
),
|
||||||
|
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
|
||||||
|
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||||
|
),
|
||||||
|
storage_controller_passthrough_request_error: measured::CounterVec::new(
|
||||||
|
PageserverRequestLabelGroupSet {
|
||||||
|
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||||
|
path: lasso::ThreadedRodeo::new(),
|
||||||
|
method: StaticLabelSet::new(),
|
||||||
|
},
|
||||||
|
),
|
||||||
|
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
|
||||||
|
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||||
|
),
|
||||||
|
storage_controller_database_query_error: measured::CounterVec::new(
|
||||||
|
DatabaseQueryErrorLabelGroupSet {
|
||||||
|
operation: StaticLabelSet::new(),
|
||||||
|
error_type: StaticLabelSet::new(),
|
||||||
|
},
|
||||||
|
),
|
||||||
|
storage_controller_database_query_latency: measured::HistogramVec::new(
|
||||||
|
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(measured::LabelGroup)]
|
||||||
|
#[label(set = ReconcileCompleteLabelGroupSet)]
|
||||||
|
pub(crate) struct ReconcileCompleteLabelGroup {
|
||||||
|
pub(crate) status: ReconcileOutcome,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(measured::LabelGroup)]
|
||||||
|
#[label(set = HttpRequestStatusLabelGroupSet)]
|
||||||
|
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
|
||||||
|
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||||
|
pub(crate) path: &'a str,
|
||||||
|
pub(crate) method: Method,
|
||||||
|
pub(crate) status: StatusCode,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(measured::LabelGroup)]
|
||||||
|
#[label(set = HttpRequestLatencyLabelGroupSet)]
|
||||||
|
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
|
||||||
|
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||||
|
pub(crate) path: &'a str,
|
||||||
|
pub(crate) method: Method,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for HttpRequestLatencyLabelGroupSet {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
path: lasso::ThreadedRodeo::new(),
|
||||||
|
method: StaticLabelSet::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(measured::LabelGroup, Clone)]
|
||||||
|
#[label(set = PageserverRequestLabelGroupSet)]
|
||||||
|
pub(crate) struct PageserverRequestLabelGroup<'a> {
|
||||||
|
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||||
|
pub(crate) pageserver_id: &'a str,
|
||||||
|
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||||
|
pub(crate) path: &'a str,
|
||||||
|
pub(crate) method: Method,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for PageserverRequestLabelGroupSet {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||||
|
path: lasso::ThreadedRodeo::new(),
|
||||||
|
method: StaticLabelSet::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(measured::LabelGroup)]
|
||||||
|
#[label(set = DatabaseQueryErrorLabelGroupSet)]
|
||||||
|
pub(crate) struct DatabaseQueryErrorLabelGroup {
|
||||||
|
pub(crate) error_type: DatabaseErrorLabel,
|
||||||
|
pub(crate) operation: DatabaseOperation,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(measured::LabelGroup)]
|
||||||
|
#[label(set = DatabaseQueryLatencyLabelGroupSet)]
|
||||||
|
pub(crate) struct DatabaseQueryLatencyLabelGroup {
|
||||||
|
pub(crate) operation: DatabaseOperation,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(FixedCardinalityLabel)]
|
||||||
|
pub(crate) enum ReconcileOutcome {
|
||||||
|
#[label(rename = "ok")]
|
||||||
|
Success,
|
||||||
|
Error,
|
||||||
|
Cancel,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(FixedCardinalityLabel, Clone)]
|
||||||
|
pub(crate) enum Method {
|
||||||
|
Get,
|
||||||
|
Put,
|
||||||
|
Post,
|
||||||
|
Delete,
|
||||||
|
Other,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<hyper::Method> for Method {
    /// Maps an HTTP method onto the fixed-cardinality metric label.
    ///
    /// GET, PUT, POST and DELETE get their own label; every other method is
    /// reported as [`Method::Other`].
    fn from(value: hyper::Method) -> Self {
        match value {
            hyper::Method::GET => Self::Get,
            hyper::Method::PUT => Self::Put,
            hyper::Method::POST => Self::Post,
            hyper::Method::DELETE => Self::Delete,
            _ => Self::Other,
        }
    }
}
|
||||||
|
|
||||||
|
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
|
||||||
|
|
||||||
|
impl LabelValue for StatusCode {
|
||||||
|
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
|
||||||
|
v.write_int(self.0.as_u16() as u64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FixedCardinalityLabel for StatusCode {
|
||||||
|
fn cardinality() -> usize {
|
||||||
|
(100..1000).len()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn encode(&self) -> usize {
|
||||||
|
self.0.as_u16() as usize
|
||||||
|
}
|
||||||
|
|
||||||
|
fn decode(value: usize) -> Self {
|
||||||
|
Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(FixedCardinalityLabel)]
|
||||||
|
pub(crate) enum DatabaseErrorLabel {
|
||||||
|
Query,
|
||||||
|
Connection,
|
||||||
|
ConnectionPool,
|
||||||
|
Logical,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DatabaseError {
|
||||||
|
pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
|
||||||
|
match self {
|
||||||
|
Self::Query(_) => DatabaseErrorLabel::Query,
|
||||||
|
Self::Connection(_) => DatabaseErrorLabel::Connection,
|
||||||
|
Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
|
||||||
|
Self::Logical(_) => DatabaseErrorLabel::Logical,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,7 +12,9 @@ use serde::Serialize;
|
|||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use utils::{backoff, id::NodeId};
|
use utils::{backoff, id::NodeId};
|
||||||
|
|
||||||
use crate::persistence::NodePersistence;
|
use crate::{
|
||||||
|
pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
|
||||||
|
};
|
||||||
|
|
||||||
/// Represents the in-memory description of a Node.
|
/// Represents the in-memory description of a Node.
|
||||||
///
|
///
|
||||||
@@ -83,29 +85,38 @@ impl Node {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn set_availability(
|
pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
|
||||||
&mut self,
|
match self.get_availability_transition(availability) {
|
||||||
availability: NodeAvailability,
|
AvailabilityTransition::ToActive => {
|
||||||
) -> AvailabilityTransition {
|
|
||||||
use NodeAvailability::*;
|
|
||||||
let transition = match (self.availability, availability) {
|
|
||||||
(Offline, Active) => {
|
|
||||||
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
|
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
|
||||||
// users of previously-cloned copies of the node will still see the old cancellation
|
// users of previously-cloned copies of the node will still see the old cancellation
|
||||||
// state. For example, Reconcilers in flight will have to complete and be spawned
|
// state. For example, Reconcilers in flight will have to complete and be spawned
|
||||||
// again to realize that the node has become available.
|
// again to realize that the node has become available.
|
||||||
self.cancel = CancellationToken::new();
|
self.cancel = CancellationToken::new();
|
||||||
AvailabilityTransition::ToActive
|
|
||||||
}
|
}
|
||||||
(Active, Offline) => {
|
AvailabilityTransition::ToOffline => {
|
||||||
// Fire the node's cancellation token to cancel any in-flight API requests to it
|
// Fire the node's cancellation token to cancel any in-flight API requests to it
|
||||||
self.cancel.cancel();
|
self.cancel.cancel();
|
||||||
AvailabilityTransition::ToOffline
|
|
||||||
}
|
}
|
||||||
_ => AvailabilityTransition::Unchanged,
|
AvailabilityTransition::Unchanged => {}
|
||||||
};
|
}
|
||||||
self.availability = availability;
|
self.availability = availability;
|
||||||
transition
|
}
|
||||||
|
|
||||||
|
/// Without modifying the availability of the node, convert the intended availability
|
||||||
|
/// into a description of the transition.
|
||||||
|
pub(crate) fn get_availability_transition(
|
||||||
|
&self,
|
||||||
|
availability: NodeAvailability,
|
||||||
|
) -> AvailabilityTransition {
|
||||||
|
use AvailabilityTransition::*;
|
||||||
|
use NodeAvailability::*;
|
||||||
|
|
||||||
|
match (self.availability, availability) {
|
||||||
|
(Offline, Active(_)) => ToActive,
|
||||||
|
(Active(_), Offline) => ToOffline,
|
||||||
|
_ => Unchanged,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Whether we may send API requests to this node.
|
/// Whether we may send API requests to this node.
|
||||||
@@ -114,21 +125,21 @@ impl Node {
|
|||||||
// a reference to the original Node's cancellation status. Checking both of these results
|
// a reference to the original Node's cancellation status. Checking both of these results
|
||||||
// in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
|
// in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
|
||||||
// when we cloned it, or if the original Node instance's cancellation token was fired.
|
// when we cloned it, or if the original Node instance's cancellation token was fired.
|
||||||
matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
|
matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Is this node eligible to have work scheduled onto it?
|
/// Is this node eligible to have work scheduled onto it?
|
||||||
pub(crate) fn may_schedule(&self) -> bool {
|
pub(crate) fn may_schedule(&self) -> MaySchedule {
|
||||||
match self.availability {
|
let score = match self.availability {
|
||||||
NodeAvailability::Active => {}
|
NodeAvailability::Active(score) => score,
|
||||||
NodeAvailability::Offline => return false,
|
NodeAvailability::Offline => return MaySchedule::No,
|
||||||
}
|
};
|
||||||
|
|
||||||
match self.scheduling {
|
match self.scheduling {
|
||||||
NodeSchedulingPolicy::Active => true,
|
NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
|
||||||
NodeSchedulingPolicy::Draining => false,
|
NodeSchedulingPolicy::Draining => MaySchedule::No,
|
||||||
NodeSchedulingPolicy::Filling => true,
|
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
|
||||||
NodeSchedulingPolicy::Pause => false,
|
NodeSchedulingPolicy::Pause => MaySchedule::No,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -146,8 +157,7 @@ impl Node {
|
|||||||
listen_pg_addr,
|
listen_pg_addr,
|
||||||
listen_pg_port,
|
listen_pg_port,
|
||||||
scheduling: NodeSchedulingPolicy::Filling,
|
scheduling: NodeSchedulingPolicy::Filling,
|
||||||
// TODO: we shouldn't really call this Active until we've heartbeated it.
|
availability: NodeAvailability::Offline,
|
||||||
availability: NodeAvailability::Active,
|
|
||||||
cancel: CancellationToken::new(),
|
cancel: CancellationToken::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -194,7 +204,7 @@ impl Node {
|
|||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Option<mgmt_api::Result<T>>
|
) -> Option<mgmt_api::Result<T>>
|
||||||
where
|
where
|
||||||
O: FnMut(mgmt_api::Client) -> F,
|
O: FnMut(PageserverClient) -> F,
|
||||||
F: std::future::Future<Output = mgmt_api::Result<T>>,
|
F: std::future::Future<Output = mgmt_api::Result<T>>,
|
||||||
{
|
{
|
||||||
fn is_fatal(e: &mgmt_api::Error) -> bool {
|
fn is_fatal(e: &mgmt_api::Error) -> bool {
|
||||||
@@ -216,8 +226,12 @@ impl Node {
|
|||||||
.build()
|
.build()
|
||||||
.expect("Failed to construct HTTP client");
|
.expect("Failed to construct HTTP client");
|
||||||
|
|
||||||
let client =
|
let client = PageserverClient::from_client(
|
||||||
mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
|
self.get_id(),
|
||||||
|
http_client,
|
||||||
|
self.base_url(),
|
||||||
|
jwt.as_deref(),
|
||||||
|
);
|
||||||
|
|
||||||
let node_cancel_fut = self.cancel.cancelled();
|
let node_cancel_fut = self.cancel.cancelled();
|
||||||
|
|
||||||
|
|||||||
203
control_plane/attachment_service/src/pageserver_client.rs
Normal file
203
control_plane/attachment_service/src/pageserver_client.rs
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
use pageserver_api::{
|
||||||
|
models::{
|
||||||
|
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
|
||||||
|
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||||
|
},
|
||||||
|
shard::TenantShardId,
|
||||||
|
};
|
||||||
|
use pageserver_client::mgmt_api::{Client, Result};
|
||||||
|
use reqwest::StatusCode;
|
||||||
|
use utils::id::{NodeId, TimelineId};
|
||||||
|
|
||||||
|
/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
|
||||||
|
/// controller to collect metrics in a non-intrusive manner.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub(crate) struct PageserverClient {
|
||||||
|
inner: Client,
|
||||||
|
node_id_label: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wraps one pageserver request expression with metrics.
//
// Arguments: request name (used as the `path` label), the `Method` label, the
// pageserver id label, and the expression that performs the request. A latency
// timer is started before the expression is evaluated and its guard is kept until
// the end of the block; if the result is an error, the request error counter is
// incremented with the same labels. The result of the expression is returned as-is.
macro_rules! measured_request {
    ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
        let request_labels = crate::metrics::PageserverRequestLabelGroup {
            pageserver_id: $node_id,
            path: $name,
            method: $method,
        };

        let group = &crate::metrics::METRICS_REGISTRY.metrics_group;
        let _timer_guard = group
            .storage_controller_pageserver_request_latency
            .start_timer(request_labels.clone());

        let outcome = $invoke;

        if outcome.is_err() {
            group
                .storage_controller_pageserver_request_error
                .inc(request_labels)
        }

        outcome
    }};
}
|
||||||
|
|
||||||
|
impl PageserverClient {
|
||||||
|
pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
|
||||||
|
Self {
|
||||||
|
inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt),
|
||||||
|
node_id_label: node_id.0.to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn from_client(
|
||||||
|
node_id: NodeId,
|
||||||
|
raw_client: reqwest::Client,
|
||||||
|
mgmt_api_endpoint: String,
|
||||||
|
jwt: Option<&str>,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt),
|
||||||
|
node_id_label: node_id.0.to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
|
||||||
|
measured_request!(
|
||||||
|
"tenant",
|
||||||
|
crate::metrics::Method::Delete,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner.tenant_delete(tenant_shard_id).await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn tenant_time_travel_remote_storage(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timestamp: &str,
|
||||||
|
done_if_after: &str,
|
||||||
|
) -> Result<()> {
|
||||||
|
measured_request!(
|
||||||
|
"tenant_time_travel_remote_storage",
|
||||||
|
crate::metrics::Method::Put,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner
|
||||||
|
.tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after)
|
||||||
|
.await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn tenant_secondary_download(
|
||||||
|
&self,
|
||||||
|
tenant_id: TenantShardId,
|
||||||
|
wait: Option<std::time::Duration>,
|
||||||
|
) -> Result<(StatusCode, SecondaryProgress)> {
|
||||||
|
measured_request!(
|
||||||
|
"tenant_secondary_download",
|
||||||
|
crate::metrics::Method::Post,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner.tenant_secondary_download(tenant_id, wait).await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn location_config(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
config: LocationConfig,
|
||||||
|
flush_ms: Option<std::time::Duration>,
|
||||||
|
lazy: bool,
|
||||||
|
) -> Result<()> {
|
||||||
|
measured_request!(
|
||||||
|
"location_config",
|
||||||
|
crate::metrics::Method::Put,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner
|
||||||
|
.location_config(tenant_shard_id, config, flush_ms, lazy)
|
||||||
|
.await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
|
||||||
|
measured_request!(
|
||||||
|
"location_configs",
|
||||||
|
crate::metrics::Method::Get,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner.list_location_config().await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn get_location_config(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
) -> Result<Option<LocationConfig>> {
|
||||||
|
measured_request!(
|
||||||
|
"location_config",
|
||||||
|
crate::metrics::Method::Get,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner.get_location_config(tenant_shard_id).await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn timeline_create(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
req: &TimelineCreateRequest,
|
||||||
|
) -> Result<TimelineInfo> {
|
||||||
|
measured_request!(
|
||||||
|
"timeline",
|
||||||
|
crate::metrics::Method::Post,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner.timeline_create(tenant_shard_id, req).await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn timeline_delete(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
) -> Result<StatusCode> {
|
||||||
|
measured_request!(
|
||||||
|
"timeline",
|
||||||
|
crate::metrics::Method::Delete,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner
|
||||||
|
.timeline_delete(tenant_shard_id, timeline_id)
|
||||||
|
.await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn tenant_shard_split(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
req: TenantShardSplitRequest,
|
||||||
|
) -> Result<TenantShardSplitResponse> {
|
||||||
|
measured_request!(
|
||||||
|
"tenant_shard_split",
|
||||||
|
crate::metrics::Method::Put,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner.tenant_shard_split(tenant_shard_id, req).await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn timeline_list(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: &TenantShardId,
|
||||||
|
) -> Result<Vec<TimelineInfo>> {
|
||||||
|
measured_request!(
|
||||||
|
"timelines",
|
||||||
|
crate::metrics::Method::Get,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner.timeline_list(tenant_shard_id).await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||||
|
measured_request!(
|
||||||
|
"utilization",
|
||||||
|
crate::metrics::Method::Get,
|
||||||
|
&self.node_id_label,
|
||||||
|
self.inner.get_utilization().await
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -19,11 +19,14 @@ use serde::{Deserialize, Serialize};
|
|||||||
use utils::generation::Generation;
|
use utils::generation::Generation;
|
||||||
use utils::id::{NodeId, TenantId};
|
use utils::id::{NodeId, TenantId};
|
||||||
|
|
||||||
|
use crate::metrics::{
|
||||||
|
DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
|
||||||
|
};
|
||||||
use crate::node::Node;
|
use crate::node::Node;
|
||||||
|
|
||||||
/// ## What do we store?
|
/// ## What do we store?
|
||||||
///
|
///
|
||||||
/// The attachment service does not store most of its state durably.
|
/// The storage controller service does not store most of its state durably.
|
||||||
///
|
///
|
||||||
/// The essential things to store durably are:
|
/// The essential things to store durably are:
|
||||||
/// - generation numbers, as these must always advance monotonically to ensure data safety.
|
/// - generation numbers, as these must always advance monotonically to ensure data safety.
|
||||||
@@ -37,7 +40,7 @@ use crate::node::Node;
|
|||||||
///
|
///
|
||||||
/// ## Performance/efficiency
|
/// ## Performance/efficiency
|
||||||
///
|
///
|
||||||
/// The attachment service does not go via the database for most things: there are
|
/// The storage controller service does not go via the database for most things: there are
|
||||||
/// a couple of places where we must, and where efficiency matters:
|
/// a couple of places where we must, and where efficiency matters:
|
||||||
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
|
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
|
||||||
/// before it can attach a tenant, so this acts as a bound on how fast things like
|
/// before it can attach a tenant, so this acts as a bound on how fast things like
|
||||||
@@ -75,6 +78,25 @@ pub(crate) enum DatabaseError {
|
|||||||
Logical(String),
|
Logical(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(measured::FixedCardinalityLabel, Clone)]
|
||||||
|
pub(crate) enum DatabaseOperation {
|
||||||
|
InsertNode,
|
||||||
|
UpdateNode,
|
||||||
|
DeleteNode,
|
||||||
|
ListNodes,
|
||||||
|
BeginShardSplit,
|
||||||
|
CompleteShardSplit,
|
||||||
|
AbortShardSplit,
|
||||||
|
Detach,
|
||||||
|
ReAttach,
|
||||||
|
IncrementGeneration,
|
||||||
|
ListTenantShards,
|
||||||
|
InsertTenantShards,
|
||||||
|
UpdateTenantShard,
|
||||||
|
DeleteTenant,
|
||||||
|
UpdateTenantConfig,
|
||||||
|
}
|
||||||
|
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub(crate) enum AbortShardSplitStatus {
|
pub(crate) enum AbortShardSplitStatus {
|
||||||
/// We aborted the split in the database by reverting to the parent shards
|
/// We aborted the split in the database by reverting to the parent shards
|
||||||
@@ -115,6 +137,34 @@ impl Persistence {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Wraps `with_conn` in order to collect latency and error metrics
|
||||||
|
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
|
||||||
|
where
|
||||||
|
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||||
|
R: Send + 'static,
|
||||||
|
{
|
||||||
|
let latency = &METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_database_query_latency;
|
||||||
|
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
|
||||||
|
operation: op.clone(),
|
||||||
|
});
|
||||||
|
|
||||||
|
let res = self.with_conn(func).await;
|
||||||
|
|
||||||
|
if let Err(err) = &res {
|
||||||
|
let error_counter = &METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_database_query_error;
|
||||||
|
error_counter.inc(DatabaseQueryErrorLabelGroup {
|
||||||
|
error_type: err.error_label(),
|
||||||
|
operation: op,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
res
|
||||||
|
}
|
||||||
|
|
||||||
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
||||||
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
||||||
where
|
where
|
||||||
@@ -130,21 +180,27 @@ impl Persistence {
|
|||||||
/// When a node is first registered, persist it before using it for anything
|
/// When a node is first registered, persist it before using it for anything
|
||||||
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
|
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
|
||||||
let np = node.to_persistent();
|
let np = node.to_persistent();
|
||||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
self.with_measured_conn(
|
||||||
diesel::insert_into(crate::schema::nodes::table)
|
DatabaseOperation::InsertNode,
|
||||||
.values(&np)
|
move |conn| -> DatabaseResult<()> {
|
||||||
.execute(conn)?;
|
diesel::insert_into(crate::schema::nodes::table)
|
||||||
Ok(())
|
.values(&np)
|
||||||
})
|
.execute(conn)?;
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// At startup, populate the list of nodes which our shards may be placed on
|
/// At startup, populate the list of nodes which our shards may be placed on
|
||||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
|
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
|
||||||
let nodes: Vec<NodePersistence> = self
|
let nodes: Vec<NodePersistence> = self
|
||||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
.with_measured_conn(
|
||||||
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
DatabaseOperation::ListNodes,
|
||||||
})
|
move |conn| -> DatabaseResult<_> {
|
||||||
|
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
||||||
|
},
|
||||||
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
|
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
|
||||||
@@ -159,7 +215,7 @@ impl Persistence {
|
|||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::nodes::dsl::*;
|
use crate::schema::nodes::dsl::*;
|
||||||
let updated = self
|
let updated = self
|
||||||
.with_conn(move |conn| {
|
.with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
|
||||||
let updated = diesel::update(nodes)
|
let updated = diesel::update(nodes)
|
||||||
.filter(node_id.eq(input_node_id.0 as i64))
|
.filter(node_id.eq(input_node_id.0 as i64))
|
||||||
.set((scheduling_policy.eq(String::from(input_scheduling)),))
|
.set((scheduling_policy.eq(String::from(input_scheduling)),))
|
||||||
@@ -181,9 +237,12 @@ impl Persistence {
|
|||||||
/// be enriched at runtime with state discovered on pageservers.
|
/// be enriched at runtime with state discovered on pageservers.
|
||||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||||
let loaded = self
|
let loaded = self
|
||||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
.with_measured_conn(
|
||||||
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
DatabaseOperation::ListTenantShards,
|
||||||
})
|
move |conn| -> DatabaseResult<_> {
|
||||||
|
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
||||||
|
},
|
||||||
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
if loaded.is_empty() {
|
if loaded.is_empty() {
|
||||||
@@ -211,15 +270,10 @@ impl Persistence {
|
|||||||
|
|
||||||
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
|
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
|
||||||
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
|
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
|
||||||
for (tenant_id, tenant) in &mut decoded.tenants {
|
for shard in decoded.tenants.values_mut() {
|
||||||
// Backward compat: an old attachments.json from before PR #6251, replace
|
if shard.placement_policy == "\"Single\"" {
|
||||||
// empty strings with proper defaults.
|
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
|
||||||
if tenant.tenant_id.is_empty() {
|
shard.placement_policy = "{\"Attached\":0}".to_string();
|
||||||
tenant.tenant_id = tenant_id.to_string();
|
|
||||||
tenant.config = serde_json::to_string(&TenantConfig::default())
|
|
||||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
|
||||||
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
|
|
||||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -265,17 +319,20 @@ impl Persistence {
|
|||||||
shards: Vec<TenantShardPersistence>,
|
shards: Vec<TenantShardPersistence>,
|
||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
self.with_measured_conn(
|
||||||
conn.transaction(|conn| -> QueryResult<()> {
|
DatabaseOperation::InsertTenantShards,
|
||||||
for tenant in &shards {
|
move |conn| -> DatabaseResult<()> {
|
||||||
diesel::insert_into(tenant_shards)
|
conn.transaction(|conn| -> QueryResult<()> {
|
||||||
.values(tenant)
|
for tenant in &shards {
|
||||||
.execute(conn)?;
|
diesel::insert_into(tenant_shards)
|
||||||
}
|
.values(tenant)
|
||||||
|
.execute(conn)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
})?;
|
||||||
Ok(())
|
Ok(())
|
||||||
})?;
|
},
|
||||||
Ok(())
|
)
|
||||||
})
|
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -283,25 +340,31 @@ impl Persistence {
|
|||||||
/// the tenant from memory on this server.
|
/// the tenant from memory on this server.
|
||||||
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
self.with_measured_conn(
|
||||||
diesel::delete(tenant_shards)
|
DatabaseOperation::DeleteTenant,
|
||||||
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
move |conn| -> DatabaseResult<()> {
|
||||||
.execute(conn)?;
|
diesel::delete(tenant_shards)
|
||||||
|
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
||||||
|
.execute(conn)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
},
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
|
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
|
||||||
use crate::schema::nodes::dsl::*;
|
use crate::schema::nodes::dsl::*;
|
||||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
self.with_measured_conn(
|
||||||
diesel::delete(nodes)
|
DatabaseOperation::DeleteNode,
|
||||||
.filter(node_id.eq(del_node_id.0 as i64))
|
move |conn| -> DatabaseResult<()> {
|
||||||
.execute(conn)?;
|
diesel::delete(nodes)
|
||||||
|
.filter(node_id.eq(del_node_id.0 as i64))
|
||||||
|
.execute(conn)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
},
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -315,7 +378,7 @@ impl Persistence {
|
|||||||
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
|
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
let updated = self
|
let updated = self
|
||||||
.with_conn(move |conn| {
|
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
|
||||||
let rows_updated = diesel::update(tenant_shards)
|
let rows_updated = diesel::update(tenant_shards)
|
||||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||||
.set(generation.eq(generation + 1))
|
.set(generation.eq(generation + 1))
|
||||||
@@ -365,7 +428,7 @@ impl Persistence {
|
|||||||
) -> anyhow::Result<Generation> {
|
) -> anyhow::Result<Generation> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
let updated = self
|
let updated = self
|
||||||
.with_conn(move |conn| {
|
.with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
|
||||||
let updated = diesel::update(tenant_shards)
|
let updated = diesel::update(tenant_shards)
|
||||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||||
@@ -409,7 +472,7 @@ impl Persistence {
|
|||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
|
|
||||||
self.with_conn(move |conn| {
|
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
|
||||||
let query = diesel::update(tenant_shards)
|
let query = diesel::update(tenant_shards)
|
||||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||||
@@ -450,7 +513,7 @@ impl Persistence {
|
|||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
|
|
||||||
self.with_conn(move |conn| {
|
self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
|
||||||
diesel::update(tenant_shards)
|
diesel::update(tenant_shards)
|
||||||
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
||||||
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
|
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
|
||||||
@@ -465,7 +528,7 @@ impl Persistence {
|
|||||||
|
|
||||||
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_conn(move |conn| {
|
self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
|
||||||
let updated = diesel::update(tenant_shards)
|
let updated = diesel::update(tenant_shards)
|
||||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||||
@@ -495,7 +558,7 @@ impl Persistence {
|
|||||||
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
|
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
|
||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
|
||||||
conn.transaction(|conn| -> DatabaseResult<()> {
|
conn.transaction(|conn| -> DatabaseResult<()> {
|
||||||
// Mark parent shards as splitting
|
// Mark parent shards as splitting
|
||||||
|
|
||||||
@@ -559,26 +622,29 @@ impl Persistence {
|
|||||||
old_shard_count: ShardCount,
|
old_shard_count: ShardCount,
|
||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
self.with_measured_conn(
|
||||||
conn.transaction(|conn| -> QueryResult<()> {
|
DatabaseOperation::CompleteShardSplit,
|
||||||
// Drop parent shards
|
move |conn| -> DatabaseResult<()> {
|
||||||
diesel::delete(tenant_shards)
|
conn.transaction(|conn| -> QueryResult<()> {
|
||||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
// Drop parent shards
|
||||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
diesel::delete(tenant_shards)
|
||||||
.execute(conn)?;
|
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||||
|
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||||
|
.execute(conn)?;
|
||||||
|
|
||||||
// Clear sharding flag
|
// Clear sharding flag
|
||||||
let updated = diesel::update(tenant_shards)
|
let updated = diesel::update(tenant_shards)
|
||||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||||
.set((splitting.eq(0),))
|
.set((splitting.eq(0),))
|
||||||
.execute(conn)?;
|
.execute(conn)?;
|
||||||
debug_assert!(updated > 0);
|
debug_assert!(updated > 0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
})?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
})?;
|
},
|
||||||
|
)
|
||||||
Ok(())
|
|
||||||
})
|
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -590,31 +656,44 @@ impl Persistence {
|
|||||||
new_shard_count: ShardCount,
|
new_shard_count: ShardCount,
|
||||||
) -> DatabaseResult<AbortShardSplitStatus> {
|
) -> DatabaseResult<AbortShardSplitStatus> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
|
self.with_measured_conn(
|
||||||
let aborted = conn.transaction(|conn| -> QueryResult<AbortShardSplitStatus> {
|
DatabaseOperation::AbortShardSplit,
|
||||||
// Clear the splitting state on parent shards
|
move |conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||||
let updated = diesel::update(tenant_shards)
|
let aborted =
|
||||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||||
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
// Clear the splitting state on parent shards
|
||||||
.set((splitting.eq(0),))
|
let updated = diesel::update(tenant_shards)
|
||||||
.execute(conn)?;
|
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||||
|
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
||||||
|
.set((splitting.eq(0),))
|
||||||
|
.execute(conn)?;
|
||||||
|
|
||||||
// Parent shards are already gone: we cannot abort.
|
// Parent shards are already gone: we cannot abort.
|
||||||
if updated == 0 {
|
if updated == 0 {
|
||||||
return Ok(AbortShardSplitStatus::Complete);
|
return Ok(AbortShardSplitStatus::Complete);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Erase child shards
|
// Sanity check: if parent shards were present, their cardinality should
|
||||||
diesel::delete(tenant_shards)
|
// be less than the number of child shards.
|
||||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
if updated >= new_shard_count.count() as usize {
|
||||||
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
return Err(DatabaseError::Logical(format!(
|
||||||
.execute(conn)?;
|
"Unexpected parent shard count {updated} while aborting split to \
|
||||||
|
count {new_shard_count:?} on tenant {split_tenant_id}"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
Ok(AbortShardSplitStatus::Aborted)
|
// Erase child shards
|
||||||
})?;
|
diesel::delete(tenant_shards)
|
||||||
|
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||||
|
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
||||||
|
.execute(conn)?;
|
||||||
|
|
||||||
Ok(aborted)
|
Ok(AbortShardSplitStatus::Aborted)
|
||||||
})
|
})?;
|
||||||
|
|
||||||
|
Ok(aborted)
|
||||||
|
},
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
use crate::pageserver_client::PageserverClient;
|
||||||
use crate::persistence::Persistence;
|
use crate::persistence::Persistence;
|
||||||
use crate::service;
|
use crate::service;
|
||||||
use hyper::StatusCode;
|
use hyper::StatusCode;
|
||||||
@@ -8,7 +9,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
|||||||
use pageserver_client::mgmt_api;
|
use pageserver_client::mgmt_api;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::{Duration, Instant};
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use utils::generation::Generation;
|
use utils::generation::Generation;
|
||||||
use utils::id::{NodeId, TimelineId};
|
use utils::id::{NodeId, TimelineId};
|
||||||
@@ -117,6 +118,15 @@ impl Reconciler {
|
|||||||
flush_ms: Option<Duration>,
|
flush_ms: Option<Duration>,
|
||||||
lazy: bool,
|
lazy: bool,
|
||||||
) -> Result<(), ReconcileError> {
|
) -> Result<(), ReconcileError> {
|
||||||
|
if !node.is_available() && config.mode == LocationConfigMode::Detached {
|
||||||
|
// Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
|
||||||
|
// will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
|
||||||
|
// what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
|
||||||
|
tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
|
||||||
|
self.observed.locations.remove(&node.get_id());
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
self.observed
|
self.observed
|
||||||
.locations
|
.locations
|
||||||
.insert(node.get_id(), ObservedStateLocation { conf: None });
|
.insert(node.get_id(), ObservedStateLocation { conf: None });
|
||||||
@@ -149,9 +159,16 @@ impl Reconciler {
|
|||||||
};
|
};
|
||||||
tracing::info!("location_config({node}) complete: {:?}", config);
|
tracing::info!("location_config({node}) complete: {:?}", config);
|
||||||
|
|
||||||
self.observed
|
match config.mode {
|
||||||
.locations
|
LocationConfigMode::Detached => {
|
||||||
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
|
self.observed.locations.remove(&node.get_id());
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
self.observed
|
||||||
|
.locations
|
||||||
|
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -243,8 +260,11 @@ impl Reconciler {
|
|||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
node: &Node,
|
node: &Node,
|
||||||
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
||||||
let client =
|
let client = PageserverClient::new(
|
||||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
node.get_id(),
|
||||||
|
node.base_url(),
|
||||||
|
self.service_config.jwt_token.as_deref(),
|
||||||
|
);
|
||||||
|
|
||||||
let timelines = client.timeline_list(&tenant_shard_id).await?;
|
let timelines = client.timeline_list(&tenant_shard_id).await?;
|
||||||
Ok(timelines
|
Ok(timelines
|
||||||
@@ -258,22 +278,81 @@ impl Reconciler {
|
|||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
node: &Node,
|
node: &Node,
|
||||||
) -> Result<(), ReconcileError> {
|
) -> Result<(), ReconcileError> {
|
||||||
match node
|
// This is not the timeout for a request, but the total amount of time we're willing to wait
|
||||||
.with_client_retries(
|
// for a secondary location to get up to date before giving up and proceeding anyway.
|
||||||
|client| async move { client.tenant_secondary_download(tenant_shard_id).await },
|
const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
|
||||||
&self.service_config.jwt_token,
|
|
||||||
1,
|
// This is the long-polling interval for the secondary download requests we send to the destination pageserver
|
||||||
1,
|
// during a migration.
|
||||||
Duration::from_secs(60),
|
const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
|
||||||
&self.cancel,
|
|
||||||
)
|
let started_at = Instant::now();
|
||||||
.await
|
|
||||||
{
|
loop {
|
||||||
None => Err(ReconcileError::Cancel),
|
let (status, progress) = match node
|
||||||
Some(Ok(_)) => Ok(()),
|
.with_client_retries(
|
||||||
Some(Err(e)) => {
|
|client| async move {
|
||||||
tracing::info!(" (skipping destination download: {})", e);
|
client
|
||||||
Ok(())
|
.tenant_secondary_download(
|
||||||
|
tenant_shard_id,
|
||||||
|
Some(REQUEST_DOWNLOAD_TIMEOUT),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
},
|
||||||
|
&self.service_config.jwt_token,
|
||||||
|
1,
|
||||||
|
3,
|
||||||
|
REQUEST_DOWNLOAD_TIMEOUT * 2,
|
||||||
|
&self.cancel,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
None => Err(ReconcileError::Cancel),
|
||||||
|
Some(Ok(v)) => Ok(v),
|
||||||
|
Some(Err(e)) => {
|
||||||
|
// Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
|
||||||
|
// attaching, but we should not let an issue with a secondary location stop us proceeding
|
||||||
|
// with a live migration.
|
||||||
|
tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}?;
|
||||||
|
|
||||||
|
if status == StatusCode::OK {
|
||||||
|
tracing::info!(
|
||||||
|
"Downloads to {} complete: {}/{} layers, {}/{} bytes",
|
||||||
|
node,
|
||||||
|
progress.layers_downloaded,
|
||||||
|
progress.layers_total,
|
||||||
|
progress.bytes_downloaded,
|
||||||
|
progress.bytes_total
|
||||||
|
);
|
||||||
|
return Ok(());
|
||||||
|
} else if status == StatusCode::ACCEPTED {
|
||||||
|
let total_runtime = started_at.elapsed();
|
||||||
|
if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
|
||||||
|
tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes",
|
||||||
|
total_runtime.as_millis(),
|
||||||
|
progress.layers_downloaded,
|
||||||
|
progress.layers_total,
|
||||||
|
progress.bytes_downloaded,
|
||||||
|
progress.bytes_total
|
||||||
|
);
|
||||||
|
// Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
|
||||||
|
// it just makes the I/O performance for users less good.
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call
|
||||||
|
// to the pageserver is a long-poll.
|
||||||
|
tracing::info!(
|
||||||
|
"Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
|
||||||
|
node,
|
||||||
|
progress.layers_downloaded,
|
||||||
|
progress.layers_total,
|
||||||
|
progress.bytes_downloaded,
|
||||||
|
progress.bytes_total
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -416,7 +495,7 @@ impl Reconciler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
|
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then
|
||||||
// this location will be deleted in the general case reconciliation that runs after this.
|
// this location will be deleted in the general case reconciliation that runs after this.
|
||||||
let origin_secondary_conf = build_location_config(
|
let origin_secondary_conf = build_location_config(
|
||||||
&self.shard,
|
&self.shard,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use crate::{node::Node, tenant_state::TenantState};
|
use crate::{node::Node, tenant_state::TenantState};
|
||||||
|
use pageserver_api::controller_api::UtilizationScore;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use utils::{http::error::ApiError, id::NodeId};
|
use utils::{http::error::ApiError, id::NodeId};
|
||||||
@@ -19,15 +20,34 @@ impl From<ScheduleError> for ApiError {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Eq, PartialEq)]
|
#[derive(Serialize, Eq, PartialEq)]
|
||||||
|
pub enum MaySchedule {
|
||||||
|
Yes(UtilizationScore),
|
||||||
|
No,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
struct SchedulerNode {
|
struct SchedulerNode {
|
||||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
|
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
|
||||||
shard_count: usize,
|
shard_count: usize,
|
||||||
|
|
||||||
/// Whether this node is currently eligible to have new shards scheduled (this is derived
|
/// Whether this node is currently eligible to have new shards scheduled (this is derived
|
||||||
/// from a node's availability state and scheduling policy).
|
/// from a node's availability state and scheduling policy).
|
||||||
may_schedule: bool,
|
may_schedule: MaySchedule,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl PartialEq for SchedulerNode {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
let may_schedule_matches = matches!(
|
||||||
|
(&self.may_schedule, &other.may_schedule),
|
||||||
|
(MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
|
||||||
|
);
|
||||||
|
|
||||||
|
may_schedule_matches && self.shard_count == other.shard_count
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Eq for SchedulerNode {}
|
||||||
|
|
||||||
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
|
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
|
||||||
/// on which to run.
|
/// on which to run.
|
||||||
///
|
///
|
||||||
@@ -186,13 +206,15 @@ impl Scheduler {
|
|||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: When the utilization score returned by the pageserver becomes meaningful,
|
||||||
|
// schedule based on that instead of the shard count.
|
||||||
let node = nodes
|
let node = nodes
|
||||||
.iter()
|
.iter()
|
||||||
.map(|node_id| {
|
.map(|node_id| {
|
||||||
let may_schedule = self
|
let may_schedule = self
|
||||||
.nodes
|
.nodes
|
||||||
.get(node_id)
|
.get(node_id)
|
||||||
.map(|n| n.may_schedule)
|
.map(|n| n.may_schedule != MaySchedule::No)
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
(*node_id, may_schedule)
|
(*node_id, may_schedule)
|
||||||
})
|
})
|
||||||
@@ -211,7 +233,7 @@ impl Scheduler {
|
|||||||
.nodes
|
.nodes
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|(k, v)| {
|
.filter_map(|(k, v)| {
|
||||||
if hard_exclude.contains(k) || !v.may_schedule {
|
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
Some((*k, v.shard_count))
|
Some((*k, v.shard_count))
|
||||||
@@ -230,7 +252,7 @@ impl Scheduler {
|
|||||||
for (node_id, node) in &self.nodes {
|
for (node_id, node) in &self.nodes {
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
"Node {node_id}: may_schedule={} shards={}",
|
"Node {node_id}: may_schedule={} shards={}",
|
||||||
node.may_schedule,
|
node.may_schedule != MaySchedule::No,
|
||||||
node.shard_count
|
node.shard_count
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -255,6 +277,7 @@ impl Scheduler {
|
|||||||
pub(crate) mod test_utils {
|
pub(crate) mod test_utils {
|
||||||
|
|
||||||
use crate::node::Node;
|
use crate::node::Node;
|
||||||
|
use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use utils::id::NodeId;
|
use utils::id::NodeId;
|
||||||
/// Test helper: synthesize the requested number of nodes, all in active state.
|
/// Test helper: synthesize the requested number of nodes, all in active state.
|
||||||
@@ -264,13 +287,14 @@ pub(crate) mod test_utils {
|
|||||||
(1..n + 1)
|
(1..n + 1)
|
||||||
.map(|i| {
|
.map(|i| {
|
||||||
(NodeId(i), {
|
(NodeId(i), {
|
||||||
let node = Node::new(
|
let mut node = Node::new(
|
||||||
NodeId(i),
|
NodeId(i),
|
||||||
format!("httphost-{i}"),
|
format!("httphost-{i}"),
|
||||||
80 + i as u16,
|
80 + i as u16,
|
||||||
format!("pghost-{i}"),
|
format!("pghost-{i}"),
|
||||||
5432 + i as u16,
|
5432 + i as u16,
|
||||||
);
|
);
|
||||||
|
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||||
assert!(node.is_available());
|
assert!(node.is_available());
|
||||||
node
|
node
|
||||||
})
|
})
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,10 @@ use std::{
|
|||||||
time::Duration,
|
time::Duration,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::{metrics, persistence::TenantShardPersistence};
|
use crate::{
|
||||||
|
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
|
||||||
|
persistence::TenantShardPersistence,
|
||||||
|
};
|
||||||
use pageserver_api::controller_api::PlacementPolicy;
|
use pageserver_api::controller_api::PlacementPolicy;
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||||
@@ -457,22 +460,7 @@ impl TenantState {
|
|||||||
// Add/remove nodes to fulfil policy
|
// Add/remove nodes to fulfil policy
|
||||||
use PlacementPolicy::*;
|
use PlacementPolicy::*;
|
||||||
match self.policy {
|
match self.policy {
|
||||||
Single => {
|
Attached(secondary_count) => {
|
||||||
// Should have exactly one attached, and zero secondaries
|
|
||||||
if !self.intent.secondary.is_empty() {
|
|
||||||
self.intent.clear_secondary(scheduler);
|
|
||||||
modified = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
|
|
||||||
modified |= modified_attached;
|
|
||||||
|
|
||||||
if !self.intent.secondary.is_empty() {
|
|
||||||
self.intent.clear_secondary(scheduler);
|
|
||||||
modified = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Double(secondary_count) => {
|
|
||||||
let retain_secondaries = if self.intent.attached.is_none()
|
let retain_secondaries = if self.intent.attached.is_none()
|
||||||
&& scheduler.node_preferred(&self.intent.secondary).is_some()
|
&& scheduler.node_preferred(&self.intent.secondary).is_some()
|
||||||
{
|
{
|
||||||
@@ -622,7 +610,7 @@ impl TenantState {
|
|||||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||||
pub(crate) fn maybe_reconcile(
|
pub(crate) fn maybe_reconcile(
|
||||||
&mut self,
|
&mut self,
|
||||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||||
pageservers: &Arc<HashMap<NodeId, Node>>,
|
pageservers: &Arc<HashMap<NodeId, Node>>,
|
||||||
compute_hook: &Arc<ComputeHook>,
|
compute_hook: &Arc<ComputeHook>,
|
||||||
service_config: &service::Config,
|
service_config: &service::Config,
|
||||||
@@ -733,7 +721,11 @@ impl TenantState {
|
|||||||
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
||||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||||
metrics::RECONCILER.spawned.inc();
|
metrics::METRICS_REGISTRY
|
||||||
|
.metrics_group
|
||||||
|
.storage_controller_reconcile_spawn
|
||||||
|
.inc();
|
||||||
|
let result_tx = result_tx.clone();
|
||||||
let join_handle = tokio::task::spawn(
|
let join_handle = tokio::task::spawn(
|
||||||
async move {
|
async move {
|
||||||
// Wait for any previous reconcile task to complete before we start
|
// Wait for any previous reconcile task to complete before we start
|
||||||
@@ -750,10 +742,12 @@ impl TenantState {
|
|||||||
// TODO: wrap all remote API operations in cancellation check
|
// TODO: wrap all remote API operations in cancellation check
|
||||||
// as well.
|
// as well.
|
||||||
if reconciler.cancel.is_cancelled() {
|
if reconciler.cancel.is_cancelled() {
|
||||||
metrics::RECONCILER
|
metrics::METRICS_REGISTRY
|
||||||
.complete
|
.metrics_group
|
||||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
|
.storage_controller_reconcile_complete
|
||||||
.inc();
|
.inc(ReconcileCompleteLabelGroup {
|
||||||
|
status: ReconcileOutcome::Cancel,
|
||||||
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -768,18 +762,18 @@ impl TenantState {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Update result counter
|
// Update result counter
|
||||||
match &result {
|
let outcome_label = match &result {
|
||||||
Ok(_) => metrics::RECONCILER
|
Ok(_) => ReconcileOutcome::Success,
|
||||||
.complete
|
Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
|
||||||
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
|
Err(_) => ReconcileOutcome::Error,
|
||||||
Err(ReconcileError::Cancel) => metrics::RECONCILER
|
};
|
||||||
.complete
|
|
||||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
|
metrics::METRICS_REGISTRY
|
||||||
Err(_) => metrics::RECONCILER
|
.metrics_group
|
||||||
.complete
|
.storage_controller_reconcile_complete
|
||||||
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
|
.inc(ReconcileCompleteLabelGroup {
|
||||||
}
|
status: outcome_label,
|
||||||
.inc();
|
});
|
||||||
|
|
||||||
result_tx
|
result_tx
|
||||||
.send(ReconcileResult {
|
.send(ReconcileResult {
|
||||||
@@ -894,7 +888,7 @@ pub(crate) mod tests {
|
|||||||
|
|
||||||
let mut scheduler = Scheduler::new(nodes.values());
|
let mut scheduler = Scheduler::new(nodes.values());
|
||||||
|
|
||||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||||
tenant_state
|
tenant_state
|
||||||
.schedule(&mut scheduler)
|
.schedule(&mut scheduler)
|
||||||
.expect("we have enough nodes, scheduling should work");
|
.expect("we have enough nodes, scheduling should work");
|
||||||
@@ -942,7 +936,7 @@ pub(crate) mod tests {
|
|||||||
let nodes = make_test_nodes(3);
|
let nodes = make_test_nodes(3);
|
||||||
let mut scheduler = Scheduler::new(nodes.values());
|
let mut scheduler = Scheduler::new(nodes.values());
|
||||||
|
|
||||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||||
|
|
||||||
tenant_state.observed.locations.insert(
|
tenant_state.observed.locations.insert(
|
||||||
NodeId(3),
|
NodeId(3),
|
||||||
|
|||||||
@@ -294,7 +294,7 @@ where
|
|||||||
// is in state 'taken' but the thread that would unlock it is
|
// is in state 'taken' but the thread that would unlock it is
|
||||||
// not there.
|
// not there.
|
||||||
// 2. A rust object that represented some external resource in the
|
// 2. A rust object that represented some external resource in the
|
||||||
// parent now got implicitly copied by the the fork, even though
|
// parent now got implicitly copied by the fork, even though
|
||||||
// the object's type is not `Copy`. The parent program may use
|
// the object's type is not `Copy`. The parent program may use
|
||||||
// non-copyability as a way to enforce unique ownership of an
|
// non-copyability as a way to enforce unique ownership of an
|
||||||
// external resource in the typesystem. The fork breaks that
|
// external resource in the typesystem. The fork breaks that
|
||||||
|
|||||||
@@ -8,11 +8,11 @@
|
|||||||
use anyhow::{anyhow, bail, Context, Result};
|
use anyhow::{anyhow, bail, Context, Result};
|
||||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
|
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
|
||||||
use compute_api::spec::ComputeMode;
|
use compute_api::spec::ComputeMode;
|
||||||
use control_plane::attachment_service::AttachmentService;
|
|
||||||
use control_plane::endpoint::ComputeControlPlane;
|
use control_plane::endpoint::ComputeControlPlane;
|
||||||
use control_plane::local_env::{InitForceMode, LocalEnv};
|
use control_plane::local_env::{InitForceMode, LocalEnv};
|
||||||
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
||||||
use control_plane::safekeeper::SafekeeperNode;
|
use control_plane::safekeeper::SafekeeperNode;
|
||||||
|
use control_plane::storage_controller::StorageController;
|
||||||
use control_plane::{broker, local_env};
|
use control_plane::{broker, local_env};
|
||||||
use pageserver_api::controller_api::{
|
use pageserver_api::controller_api::{
|
||||||
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
|
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||||
@@ -138,7 +138,7 @@ fn main() -> Result<()> {
|
|||||||
"start" => rt.block_on(handle_start_all(sub_args, &env)),
|
"start" => rt.block_on(handle_start_all(sub_args, &env)),
|
||||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||||
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
|
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
|
||||||
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
|
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
|
||||||
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
|
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
|
||||||
"mappings" => handle_mappings(sub_args, &mut env),
|
"mappings" => handle_mappings(sub_args, &mut env),
|
||||||
@@ -437,7 +437,7 @@ async fn handle_tenant(
|
|||||||
|
|
||||||
let placement_policy = match create_match.get_one::<String>("placement-policy") {
|
let placement_policy = match create_match.get_one::<String>("placement-policy") {
|
||||||
Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
|
Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
|
||||||
_ => PlacementPolicy::Single,
|
_ => PlacementPolicy::Attached(0),
|
||||||
};
|
};
|
||||||
|
|
||||||
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
|
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
|
||||||
@@ -445,14 +445,14 @@ async fn handle_tenant(
|
|||||||
// If tenant ID was not specified, generate one
|
// If tenant ID was not specified, generate one
|
||||||
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
|
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
|
||||||
|
|
||||||
// We must register the tenant with the attachment service, so
|
// We must register the tenant with the storage controller, so
|
||||||
// that when the pageserver restarts, it will be re-attached.
|
// that when the pageserver restarts, it will be re-attached.
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
attachment_service
|
storage_controller
|
||||||
.tenant_create(TenantCreateRequest {
|
.tenant_create(TenantCreateRequest {
|
||||||
// Note that ::unsharded here isn't actually because the tenant is unsharded, it's because the
|
// Note that ::unsharded here isn't actually because the tenant is unsharded, it's because the
|
||||||
// attachment service expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest
|
// storage controller expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest
|
||||||
// type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
|
// type is used both in storage controller (for creating tenants) and in pageserver (for creating shards)
|
||||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||||
generation: None,
|
generation: None,
|
||||||
shard_parameters: ShardParameters {
|
shard_parameters: ShardParameters {
|
||||||
@@ -476,9 +476,9 @@ async fn handle_tenant(
|
|||||||
.context("Failed to parse postgres version from the argument string")?;
|
.context("Failed to parse postgres version from the argument string")?;
|
||||||
|
|
||||||
// FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
|
// FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
|
||||||
// different shards picking different start lsns. Maybe we have to teach attachment service
|
// different shards picking different start lsns. Maybe we have to teach storage controller
|
||||||
// to let shard 0 branch first and then propagate the chosen LSN to other shards.
|
// to let shard 0 branch first and then propagate the chosen LSN to other shards.
|
||||||
attachment_service
|
storage_controller
|
||||||
.tenant_timeline_create(
|
.tenant_timeline_create(
|
||||||
tenant_id,
|
tenant_id,
|
||||||
TimelineCreateRequest {
|
TimelineCreateRequest {
|
||||||
@@ -523,84 +523,6 @@ async fn handle_tenant(
|
|||||||
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
||||||
println!("tenant {tenant_id} successfully configured on the pageserver");
|
println!("tenant {tenant_id} successfully configured on the pageserver");
|
||||||
}
|
}
|
||||||
Some(("migrate", matches)) => {
|
|
||||||
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
|
|
||||||
let new_pageserver = get_pageserver(env, matches)?;
|
|
||||||
let new_pageserver_id = new_pageserver.conf.id;
|
|
||||||
|
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
|
||||||
attachment_service
|
|
||||||
.tenant_migrate(tenant_shard_id, new_pageserver_id)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
|
|
||||||
}
|
|
||||||
Some(("status", matches)) => {
|
|
||||||
let tenant_id = get_tenant_id(matches, env)?;
|
|
||||||
|
|
||||||
let mut shard_table = comfy_table::Table::new();
|
|
||||||
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
|
|
||||||
|
|
||||||
let mut tenant_synthetic_size = None;
|
|
||||||
|
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
|
||||||
for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
|
|
||||||
let pageserver =
|
|
||||||
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
|
|
||||||
|
|
||||||
let size = pageserver
|
|
||||||
.http_client
|
|
||||||
.tenant_details(shard.shard_id)
|
|
||||||
.await?
|
|
||||||
.tenant_info
|
|
||||||
.current_physical_size
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
shard_table.add_row([
|
|
||||||
format!("{}", shard.shard_id.shard_slug()),
|
|
||||||
format!("{}", shard.node_id.0),
|
|
||||||
format!("{} MiB", size / (1024 * 1024)),
|
|
||||||
]);
|
|
||||||
|
|
||||||
if shard.shard_id.is_zero() {
|
|
||||||
tenant_synthetic_size =
|
|
||||||
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let Some(synthetic_size) = tenant_synthetic_size else {
|
|
||||||
bail!("Shard 0 not found")
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut tenant_table = comfy_table::Table::new();
|
|
||||||
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
|
|
||||||
tenant_table.add_row([
|
|
||||||
"Synthetic size".to_string(),
|
|
||||||
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
|
|
||||||
]);
|
|
||||||
|
|
||||||
println!("{tenant_table}");
|
|
||||||
println!("{shard_table}");
|
|
||||||
}
|
|
||||||
Some(("shard-split", matches)) => {
|
|
||||||
let tenant_id = get_tenant_id(matches, env)?;
|
|
||||||
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
|
|
||||||
|
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
|
||||||
let result = attachment_service
|
|
||||||
.tenant_split(tenant_id, shard_count)
|
|
||||||
.await?;
|
|
||||||
println!(
|
|
||||||
"Split tenant {} into shards {}",
|
|
||||||
tenant_id,
|
|
||||||
result
|
|
||||||
.new_shards
|
|
||||||
.iter()
|
|
||||||
.map(|s| format!("{:?}", s))
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.join(",")
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
||||||
None => bail!("no tenant subcommand provided"),
|
None => bail!("no tenant subcommand provided"),
|
||||||
@@ -613,7 +535,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
|||||||
|
|
||||||
match timeline_match.subcommand() {
|
match timeline_match.subcommand() {
|
||||||
Some(("list", list_match)) => {
|
Some(("list", list_match)) => {
|
||||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
|
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
|
||||||
// where shard 0 is attached, and query there.
|
// where shard 0 is attached, and query there.
|
||||||
let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
|
let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
|
||||||
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
|
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
|
||||||
@@ -633,7 +555,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
|||||||
let new_timeline_id_opt = parse_timeline_id(create_match)?;
|
let new_timeline_id_opt = parse_timeline_id(create_match)?;
|
||||||
let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
|
let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
|
||||||
|
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
let create_req = TimelineCreateRequest {
|
let create_req = TimelineCreateRequest {
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
ancestor_timeline_id: None,
|
ancestor_timeline_id: None,
|
||||||
@@ -641,7 +563,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
|||||||
ancestor_start_lsn: None,
|
ancestor_start_lsn: None,
|
||||||
pg_version: Some(pg_version),
|
pg_version: Some(pg_version),
|
||||||
};
|
};
|
||||||
let timeline_info = attachment_service
|
let timeline_info = storage_controller
|
||||||
.tenant_timeline_create(tenant_id, create_req)
|
.tenant_timeline_create(tenant_id, create_req)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -730,7 +652,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
|||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse ancestor start Lsn from the request")?;
|
.context("Failed to parse ancestor start Lsn from the request")?;
|
||||||
let new_timeline_id = TimelineId::generate();
|
let new_timeline_id = TimelineId::generate();
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
let create_req = TimelineCreateRequest {
|
let create_req = TimelineCreateRequest {
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
ancestor_timeline_id: Some(ancestor_timeline_id),
|
ancestor_timeline_id: Some(ancestor_timeline_id),
|
||||||
@@ -738,7 +660,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
|||||||
ancestor_start_lsn: start_lsn,
|
ancestor_start_lsn: start_lsn,
|
||||||
pg_version: None,
|
pg_version: None,
|
||||||
};
|
};
|
||||||
let timeline_info = attachment_service
|
let timeline_info = storage_controller
|
||||||
.tenant_timeline_create(tenant_id, create_req)
|
.tenant_timeline_create(tenant_id, create_req)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -767,7 +689,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
|||||||
|
|
||||||
match sub_name {
|
match sub_name {
|
||||||
"list" => {
|
"list" => {
|
||||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
|
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
|
||||||
// where shard 0 is attached, and query there.
|
// where shard 0 is attached, and query there.
|
||||||
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
|
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
|
||||||
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
|
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
|
||||||
@@ -952,21 +874,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
|||||||
(
|
(
|
||||||
vec![(parsed.0, parsed.1.unwrap_or(5432))],
|
vec![(parsed.0, parsed.1.unwrap_or(5432))],
|
||||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||||
// fully managed by attachment service, therefore not sharded.
|
// fully managed by storage controller, therefore not sharded.
|
||||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||||
// to pass these on to postgres.
|
// to pass these on to postgres.
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
|
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
|
||||||
let pageservers = locate_result
|
let pageservers = locate_result
|
||||||
.shards
|
.shards
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|shard| {
|
.map(|shard| {
|
||||||
(
|
(
|
||||||
Host::parse(&shard.listen_pg_addr)
|
Host::parse(&shard.listen_pg_addr)
|
||||||
.expect("Attachment service reported bad hostname"),
|
.expect("Storage controller reported bad hostname"),
|
||||||
shard.listen_pg_port,
|
shard.listen_pg_port,
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
@@ -1015,8 +937,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
|||||||
pageserver.pg_connection_config.port(),
|
pageserver.pg_connection_config.port(),
|
||||||
)]
|
)]
|
||||||
} else {
|
} else {
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
attachment_service
|
storage_controller
|
||||||
.tenant_locate(endpoint.tenant_id)
|
.tenant_locate(endpoint.tenant_id)
|
||||||
.await?
|
.await?
|
||||||
.shards
|
.shards
|
||||||
@@ -1024,7 +946,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
|||||||
.map(|shard| {
|
.map(|shard| {
|
||||||
(
|
(
|
||||||
Host::parse(&shard.listen_pg_addr)
|
Host::parse(&shard.listen_pg_addr)
|
||||||
.expect("Attachment service reported malformed host"),
|
.expect("Storage controller reported malformed host"),
|
||||||
shard.listen_pg_port,
|
shard.listen_pg_port,
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
@@ -1100,9 +1022,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
|||||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||||
match sub_match.subcommand() {
|
match sub_match.subcommand() {
|
||||||
Some(("start", subcommand_args)) => {
|
Some(("start", subcommand_args)) => {
|
||||||
let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
|
|
||||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||||
.start(&pageserver_config_overrides(subcommand_args), *register)
|
.start(&pageserver_config_overrides(subcommand_args))
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
eprintln!("pageserver start failed: {e}");
|
eprintln!("pageserver start failed: {e}");
|
||||||
@@ -1131,7 +1052,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Err(e) = pageserver
|
if let Err(e) = pageserver
|
||||||
.start(&pageserver_config_overrides(subcommand_args), false)
|
.start(&pageserver_config_overrides(subcommand_args))
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
eprintln!("pageserver start failed: {e}");
|
eprintln!("pageserver start failed: {e}");
|
||||||
@@ -1144,8 +1065,8 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
|||||||
let scheduling = subcommand_args.get_one("scheduling");
|
let scheduling = subcommand_args.get_one("scheduling");
|
||||||
let availability = subcommand_args.get_one("availability");
|
let availability = subcommand_args.get_one("availability");
|
||||||
|
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
attachment_service
|
storage_controller
|
||||||
.node_configure(NodeConfigureRequest {
|
.node_configure(NodeConfigureRequest {
|
||||||
node_id: pageserver.conf.id,
|
node_id: pageserver.conf.id,
|
||||||
scheduling: scheduling.cloned(),
|
scheduling: scheduling.cloned(),
|
||||||
@@ -1170,11 +1091,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_attachment_service(
|
async fn handle_storage_controller(
|
||||||
sub_match: &ArgMatches,
|
sub_match: &ArgMatches,
|
||||||
env: &local_env::LocalEnv,
|
env: &local_env::LocalEnv,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let svc = AttachmentService::from_env(env);
|
let svc = StorageController::from_env(env);
|
||||||
match sub_match.subcommand() {
|
match sub_match.subcommand() {
|
||||||
Some(("start", _start_match)) => {
|
Some(("start", _start_match)) => {
|
||||||
if let Err(e) = svc.start().await {
|
if let Err(e) = svc.start().await {
|
||||||
@@ -1194,8 +1115,8 @@ async fn handle_attachment_service(
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
|
Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name),
|
||||||
None => bail!("no attachment_service subcommand provided"),
|
None => bail!("no storage_controller subcommand provided"),
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1280,11 +1201,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
|||||||
|
|
||||||
broker::start_broker_process(env).await?;
|
broker::start_broker_process(env).await?;
|
||||||
|
|
||||||
// Only start the attachment service if the pageserver is configured to need it
|
// Only start the storage controller if the pageserver is configured to need it
|
||||||
if env.control_plane_api.is_some() {
|
if env.control_plane_api.is_some() {
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
if let Err(e) = attachment_service.start().await {
|
if let Err(e) = storage_controller.start().await {
|
||||||
eprintln!("attachment_service start failed: {:#}", e);
|
eprintln!("storage_controller start failed: {:#}", e);
|
||||||
try_stop_all(env, true).await;
|
try_stop_all(env, true).await;
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@@ -1293,7 +1214,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
|||||||
for ps_conf in &env.pageservers {
|
for ps_conf in &env.pageservers {
|
||||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||||
if let Err(e) = pageserver
|
if let Err(e) = pageserver
|
||||||
.start(&pageserver_config_overrides(sub_match), true)
|
.start(&pageserver_config_overrides(sub_match))
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||||
@@ -1356,9 +1277,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if env.control_plane_api.is_some() {
|
if env.control_plane_api.is_some() {
|
||||||
let attachment_service = AttachmentService::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
if let Err(e) = attachment_service.stop(immediate).await {
|
if let Err(e) = storage_controller.stop(immediate).await {
|
||||||
eprintln!("attachment service stop failed: {e:#}");
|
eprintln!("storage controller stop failed: {e:#}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1575,18 +1496,6 @@ fn cli() -> Command {
|
|||||||
.subcommand(Command::new("config")
|
.subcommand(Command::new("config")
|
||||||
.arg(tenant_id_arg.clone())
|
.arg(tenant_id_arg.clone())
|
||||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
||||||
.subcommand(Command::new("migrate")
|
|
||||||
.about("Migrate a tenant from one pageserver to another")
|
|
||||||
.arg(tenant_id_arg.clone())
|
|
||||||
.arg(pageserver_id_arg.clone()))
|
|
||||||
.subcommand(Command::new("status")
|
|
||||||
.about("Human readable summary of the tenant's shards and attachment locations")
|
|
||||||
.arg(tenant_id_arg.clone()))
|
|
||||||
.subcommand(Command::new("shard-split")
|
|
||||||
.about("Increase the number of shards in the tenant")
|
|
||||||
.arg(tenant_id_arg.clone())
|
|
||||||
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("pageserver")
|
Command::new("pageserver")
|
||||||
@@ -1596,11 +1505,7 @@ fn cli() -> Command {
|
|||||||
.subcommand(Command::new("status"))
|
.subcommand(Command::new("status"))
|
||||||
.subcommand(Command::new("start")
|
.subcommand(Command::new("start")
|
||||||
.about("Start local pageserver")
|
.about("Start local pageserver")
|
||||||
.arg(pageserver_config_args.clone()).arg(Arg::new("register")
|
.arg(pageserver_config_args.clone())
|
||||||
.long("register")
|
|
||||||
.default_value("true").required(false)
|
|
||||||
.value_parser(value_parser!(bool))
|
|
||||||
.value_name("register"))
|
|
||||||
)
|
)
|
||||||
.subcommand(Command::new("stop")
|
.subcommand(Command::new("stop")
|
||||||
.about("Stop local pageserver")
|
.about("Stop local pageserver")
|
||||||
@@ -1618,9 +1523,9 @@ fn cli() -> Command {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("attachment_service")
|
Command::new("storage_controller")
|
||||||
.arg_required_else_help(true)
|
.arg_required_else_help(true)
|
||||||
.about("Manage attachment_service")
|
.about("Manage storage_controller")
|
||||||
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
||||||
.subcommand(Command::new("stop").about("Stop local pageserver")
|
.subcommand(Command::new("stop").about("Stop local pageserver")
|
||||||
.arg(stop_mode_arg.clone()))
|
.arg(stop_mode_arg.clone()))
|
||||||
|
|||||||
@@ -12,7 +12,7 @@
|
|||||||
//!
|
//!
|
||||||
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
|
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
|
||||||
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
|
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
|
||||||
//! the basebackup from the pageserver to initialize the the data directory, and
|
//! the basebackup from the pageserver to initialize the data directory, and
|
||||||
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
|
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
|
||||||
//! until it exits.
|
//! until it exits.
|
||||||
//!
|
//!
|
||||||
@@ -57,9 +57,9 @@ use serde::{Deserialize, Serialize};
|
|||||||
use url::Host;
|
use url::Host;
|
||||||
use utils::id::{NodeId, TenantId, TimelineId};
|
use utils::id::{NodeId, TenantId, TimelineId};
|
||||||
|
|
||||||
use crate::attachment_service::AttachmentService;
|
|
||||||
use crate::local_env::LocalEnv;
|
use crate::local_env::LocalEnv;
|
||||||
use crate::postgresql_conf::PostgresConf;
|
use crate::postgresql_conf::PostgresConf;
|
||||||
|
use crate::storage_controller::StorageController;
|
||||||
|
|
||||||
use compute_api::responses::{ComputeState, ComputeStatus};
|
use compute_api::responses::{ComputeState, ComputeStatus};
|
||||||
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
|
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
|
||||||
@@ -750,17 +750,17 @@ impl Endpoint {
|
|||||||
let postgresql_conf = self.read_postgresql_conf()?;
|
let postgresql_conf = self.read_postgresql_conf()?;
|
||||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||||
|
|
||||||
// If we weren't given explicit pageservers, query the attachment service
|
// If we weren't given explicit pageservers, query the storage controller
|
||||||
if pageservers.is_empty() {
|
if pageservers.is_empty() {
|
||||||
let attachment_service = AttachmentService::from_env(&self.env);
|
let storage_controller = StorageController::from_env(&self.env);
|
||||||
let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
|
let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
|
||||||
pageservers = locate_result
|
pageservers = locate_result
|
||||||
.shards
|
.shards
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|shard| {
|
.map(|shard| {
|
||||||
(
|
(
|
||||||
Host::parse(&shard.listen_pg_addr)
|
Host::parse(&shard.listen_pg_addr)
|
||||||
.expect("Attachment service reported bad hostname"),
|
.expect("Storage controller reported bad hostname"),
|
||||||
shard.listen_pg_port,
|
shard.listen_pg_port,
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -6,7 +6,6 @@
|
|||||||
//! local installations.
|
//! local installations.
|
||||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
pub mod attachment_service;
|
|
||||||
mod background_process;
|
mod background_process;
|
||||||
pub mod broker;
|
pub mod broker;
|
||||||
pub mod endpoint;
|
pub mod endpoint;
|
||||||
@@ -14,3 +13,4 @@ pub mod local_env;
|
|||||||
pub mod pageserver;
|
pub mod pageserver;
|
||||||
pub mod postgresql_conf;
|
pub mod postgresql_conf;
|
||||||
pub mod safekeeper;
|
pub mod safekeeper;
|
||||||
|
pub mod storage_controller;
|
||||||
|
|||||||
@@ -72,13 +72,13 @@ pub struct LocalEnv {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub safekeepers: Vec<SafekeeperConf>,
|
pub safekeepers: Vec<SafekeeperConf>,
|
||||||
|
|
||||||
// Control plane upcall API for pageserver: if None, we will not run attachment_service. If set, this will
|
// Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will
|
||||||
// be propagated into each pageserver's configuration.
|
// be propagated into each pageserver's configuration.
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub control_plane_api: Option<Url>,
|
pub control_plane_api: Option<Url>,
|
||||||
|
|
||||||
// Control plane upcall API for attachment service. If set, this will be propagated into the
|
// Control plane upcall API for storage controller. If set, this will be propagated into the
|
||||||
// attachment service's configuration.
|
// storage controller's configuration.
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub control_plane_compute_hook_api: Option<Url>,
|
pub control_plane_compute_hook_api: Option<Url>,
|
||||||
|
|
||||||
@@ -114,7 +114,7 @@ impl NeonBroker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||||
#[serde(default)]
|
#[serde(default, deny_unknown_fields)]
|
||||||
pub struct PageServerConf {
|
pub struct PageServerConf {
|
||||||
// node id
|
// node id
|
||||||
pub id: NodeId,
|
pub id: NodeId,
|
||||||
@@ -126,6 +126,9 @@ pub struct PageServerConf {
|
|||||||
// auth type used for the PG and HTTP ports
|
// auth type used for the PG and HTTP ports
|
||||||
pub pg_auth_type: AuthType,
|
pub pg_auth_type: AuthType,
|
||||||
pub http_auth_type: AuthType,
|
pub http_auth_type: AuthType,
|
||||||
|
|
||||||
|
pub(crate) virtual_file_io_engine: Option<String>,
|
||||||
|
pub(crate) get_vectored_impl: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for PageServerConf {
|
impl Default for PageServerConf {
|
||||||
@@ -136,6 +139,8 @@ impl Default for PageServerConf {
|
|||||||
listen_http_addr: String::new(),
|
listen_http_addr: String::new(),
|
||||||
pg_auth_type: AuthType::Trust,
|
pg_auth_type: AuthType::Trust,
|
||||||
http_auth_type: AuthType::Trust,
|
http_auth_type: AuthType::Trust,
|
||||||
|
virtual_file_io_engine: None,
|
||||||
|
get_vectored_impl: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -227,10 +232,10 @@ impl LocalEnv {
|
|||||||
self.neon_distrib_dir.join("pageserver")
|
self.neon_distrib_dir.join("pageserver")
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn attachment_service_bin(&self) -> PathBuf {
|
pub fn storage_controller_bin(&self) -> PathBuf {
|
||||||
// Irrespective of configuration, attachment service binary is always
|
// Irrespective of configuration, storage controller binary is always
|
||||||
// run from the same location as neon_local. This means that for compatibility
|
// run from the same location as neon_local. This means that for compatibility
|
||||||
// tests that run old pageserver/safekeeper, they still run latest attachment service.
|
// tests that run old pageserver/safekeeper, they still run latest storage controller.
|
||||||
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
|
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
|
||||||
neon_local_bin_dir.join("storage_controller")
|
neon_local_bin_dir.join("storage_controller")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,8 +17,6 @@ use std::time::Duration;
|
|||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use futures::SinkExt;
|
use futures::SinkExt;
|
||||||
use hyper::StatusCode;
|
|
||||||
use pageserver_api::controller_api::NodeRegisterRequest;
|
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
|
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||||
};
|
};
|
||||||
@@ -32,7 +30,6 @@ use utils::{
|
|||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::attachment_service::AttachmentService;
|
|
||||||
use crate::local_env::PageServerConf;
|
use crate::local_env::PageServerConf;
|
||||||
use crate::{background_process, local_env::LocalEnv};
|
use crate::{background_process, local_env::LocalEnv};
|
||||||
|
|
||||||
@@ -81,18 +78,39 @@ impl PageServerNode {
|
|||||||
///
|
///
|
||||||
/// These all end up on the command line of the `pageserver` binary.
|
/// These all end up on the command line of the `pageserver` binary.
|
||||||
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
|
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
|
||||||
let id = format!("id={}", self.conf.id);
|
|
||||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
||||||
let pg_distrib_dir_param = format!(
|
let pg_distrib_dir_param = format!(
|
||||||
"pg_distrib_dir='{}'",
|
"pg_distrib_dir='{}'",
|
||||||
self.env.pg_distrib_dir_raw().display()
|
self.env.pg_distrib_dir_raw().display()
|
||||||
);
|
);
|
||||||
|
|
||||||
let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
|
let PageServerConf {
|
||||||
let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
|
id,
|
||||||
|
listen_pg_addr,
|
||||||
|
listen_http_addr,
|
||||||
|
pg_auth_type,
|
||||||
|
http_auth_type,
|
||||||
|
virtual_file_io_engine,
|
||||||
|
get_vectored_impl,
|
||||||
|
} = &self.conf;
|
||||||
|
|
||||||
let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
|
let id = format!("id={}", id);
|
||||||
let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
|
|
||||||
|
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
|
||||||
|
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
|
||||||
|
|
||||||
|
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
|
||||||
|
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
|
||||||
|
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
|
||||||
|
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
|
||||||
|
} else {
|
||||||
|
String::new()
|
||||||
|
};
|
||||||
|
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
|
||||||
|
format!("get_vectored_impl='{get_vectored_impl}'")
|
||||||
|
} else {
|
||||||
|
String::new()
|
||||||
|
};
|
||||||
|
|
||||||
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
||||||
|
|
||||||
@@ -104,6 +122,8 @@ impl PageServerNode {
|
|||||||
listen_http_addr_param,
|
listen_http_addr_param,
|
||||||
listen_pg_addr_param,
|
listen_pg_addr_param,
|
||||||
broker_endpoint_param,
|
broker_endpoint_param,
|
||||||
|
virtual_file_io_engine,
|
||||||
|
get_vectored_impl,
|
||||||
];
|
];
|
||||||
|
|
||||||
if let Some(control_plane_api) = &self.env.control_plane_api {
|
if let Some(control_plane_api) = &self.env.control_plane_api {
|
||||||
@@ -112,9 +132,9 @@ impl PageServerNode {
|
|||||||
control_plane_api.as_str()
|
control_plane_api.as_str()
|
||||||
));
|
));
|
||||||
|
|
||||||
// Attachment service uses the same auth as pageserver: if JWT is enabled
|
// Storage controller uses the same auth as pageserver: if JWT is enabled
|
||||||
// for us, we will also need it to talk to them.
|
// for us, we will also need it to talk to them.
|
||||||
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
|
if matches!(http_auth_type, AuthType::NeonJWT) {
|
||||||
let jwt_token = self
|
let jwt_token = self
|
||||||
.env
|
.env
|
||||||
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
|
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
|
||||||
@@ -132,8 +152,7 @@ impl PageServerNode {
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
|
if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
|
||||||
{
|
|
||||||
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
||||||
// are one level below that, so refer to keys with ../
|
// are one level below that, so refer to keys with ../
|
||||||
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
||||||
@@ -164,8 +183,8 @@ impl PageServerNode {
|
|||||||
.expect("non-Unicode path")
|
.expect("non-Unicode path")
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
|
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||||
self.start_node(config_overrides, false, register).await
|
self.start_node(config_overrides, false).await
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||||
@@ -203,6 +222,28 @@ impl PageServerNode {
|
|||||||
String::from_utf8_lossy(&init_output.stderr),
|
String::from_utf8_lossy(&init_output.stderr),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Write metadata file, used by pageserver on startup to register itself with
|
||||||
|
// the storage controller
|
||||||
|
let metadata_path = datadir.join("metadata.json");
|
||||||
|
|
||||||
|
let (_http_host, http_port) =
|
||||||
|
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
||||||
|
let http_port = http_port.unwrap_or(9898);
|
||||||
|
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||||
|
// in case the pageserver-side structure is edited, and reflects the real life
|
||||||
|
// situation: the metadata is written by some other script.
|
||||||
|
std::fs::write(
|
||||||
|
metadata_path,
|
||||||
|
serde_json::to_vec(&serde_json::json!({
|
||||||
|
"host": "localhost",
|
||||||
|
"port": self.pg_connection_config.port(),
|
||||||
|
"http_host": "localhost",
|
||||||
|
"http_port": http_port,
|
||||||
|
}))
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.expect("Failed to write metadata file");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -210,27 +251,7 @@ impl PageServerNode {
|
|||||||
&self,
|
&self,
|
||||||
config_overrides: &[&str],
|
config_overrides: &[&str],
|
||||||
update_config: bool,
|
update_config: bool,
|
||||||
register: bool,
|
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// Register the node with the storage controller before starting pageserver: pageserver must be registered to
|
|
||||||
// successfully call /re-attach and finish starting up.
|
|
||||||
if register {
|
|
||||||
let attachment_service = AttachmentService::from_env(&self.env);
|
|
||||||
let (pg_host, pg_port) =
|
|
||||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
|
||||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
|
||||||
.expect("Unable to parse listen_http_addr");
|
|
||||||
attachment_service
|
|
||||||
.node_register(NodeRegisterRequest {
|
|
||||||
node_id: self.conf.id,
|
|
||||||
listen_pg_addr: pg_host.to_string(),
|
|
||||||
listen_pg_port: pg_port.unwrap_or(5432),
|
|
||||||
listen_http_addr: http_host.to_string(),
|
|
||||||
listen_http_port: http_port.unwrap_or(80),
|
|
||||||
})
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||||
let datadir = self.repo_path();
|
let datadir = self.repo_path();
|
||||||
print!(
|
print!(
|
||||||
@@ -263,11 +284,6 @@ impl PageServerNode {
|
|||||||
match st {
|
match st {
|
||||||
Ok(()) => Ok(true),
|
Ok(()) => Ok(true),
|
||||||
Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
|
Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
|
||||||
Err(mgmt_api::Error::ApiError(status, _msg))
|
|
||||||
if status == StatusCode::SERVICE_UNAVAILABLE =>
|
|
||||||
{
|
|
||||||
Ok(false)
|
|
||||||
}
|
|
||||||
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -560,13 +576,6 @@ impl PageServerNode {
|
|||||||
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
|
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
|
|
||||||
Ok(self
|
|
||||||
.http_client
|
|
||||||
.tenant_secondary_download(*tenant_id)
|
|
||||||
.await?)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn timeline_create(
|
pub async fn timeline_create(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ use pageserver_api::{
|
|||||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||||
TimelineCreateRequest, TimelineInfo,
|
TimelineCreateRequest, TimelineInfo,
|
||||||
},
|
},
|
||||||
shard::TenantShardId,
|
shard::{ShardStripeSize, TenantShardId},
|
||||||
};
|
};
|
||||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||||
use postgres_backend::AuthType;
|
use postgres_backend::AuthType;
|
||||||
@@ -24,7 +24,7 @@ use utils::{
|
|||||||
id::{NodeId, TenantId},
|
id::{NodeId, TenantId},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct AttachmentService {
|
pub struct StorageController {
|
||||||
env: LocalEnv,
|
env: LocalEnv,
|
||||||
listen: String,
|
listen: String,
|
||||||
path: Utf8PathBuf,
|
path: Utf8PathBuf,
|
||||||
@@ -36,7 +36,10 @@ pub struct AttachmentService {
|
|||||||
|
|
||||||
const COMMAND: &str = "storage_controller";
|
const COMMAND: &str = "storage_controller";
|
||||||
|
|
||||||
const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
|
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||||
|
|
||||||
|
// Use a shorter pageserver unavailability interval than the default to speed up tests.
|
||||||
|
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct AttachHookRequest {
|
pub struct AttachHookRequest {
|
||||||
@@ -59,7 +62,7 @@ pub struct InspectResponse {
|
|||||||
pub attachment: Option<(u32, NodeId)>,
|
pub attachment: Option<(u32, NodeId)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AttachmentService {
|
impl StorageController {
|
||||||
pub fn from_env(env: &LocalEnv) -> Self {
|
pub fn from_env(env: &LocalEnv) -> Self {
|
||||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||||
.unwrap()
|
.unwrap()
|
||||||
@@ -136,27 +139,27 @@ impl AttachmentService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn pid_file(&self) -> Utf8PathBuf {
|
fn pid_file(&self) -> Utf8PathBuf {
|
||||||
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
|
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
|
||||||
.expect("non-Unicode path")
|
.expect("non-Unicode path")
|
||||||
}
|
}
|
||||||
|
|
||||||
/// PIDFile for the postgres instance used to store attachment service state
|
/// PIDFile for the postgres instance used to store storage controller state
|
||||||
fn postgres_pid_file(&self) -> Utf8PathBuf {
|
fn postgres_pid_file(&self) -> Utf8PathBuf {
|
||||||
Utf8PathBuf::from_path_buf(
|
Utf8PathBuf::from_path_buf(
|
||||||
self.env
|
self.env
|
||||||
.base_data_dir
|
.base_data_dir
|
||||||
.join("attachment_service_postgres.pid"),
|
.join("storage_controller_postgres.pid"),
|
||||||
)
|
)
|
||||||
.expect("non-Unicode path")
|
.expect("non-Unicode path")
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
||||||
///
|
///
|
||||||
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
|
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
|
||||||
/// to other versions if that one isn't found. Some automated tests create circumstances
|
/// to other versions if that one isn't found. Some automated tests create circumstances
|
||||||
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
||||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||||
let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
|
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
|
||||||
|
|
||||||
for v in prefer_versions {
|
for v in prefer_versions {
|
||||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
||||||
@@ -189,7 +192,7 @@ impl AttachmentService {
|
|||||||
///
|
///
|
||||||
/// Returns the database url
|
/// Returns the database url
|
||||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||||
const DB_NAME: &str = "attachment_service";
|
const DB_NAME: &str = "storage_controller";
|
||||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||||
|
|
||||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||||
@@ -219,10 +222,10 @@ impl AttachmentService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub async fn start(&self) -> anyhow::Result<()> {
|
pub async fn start(&self) -> anyhow::Result<()> {
|
||||||
// Start a vanilla Postgres process used by the attachment service for persistence.
|
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.join("attachment_service_db");
|
.join("storage_controller_db");
|
||||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||||
let pg_log_path = pg_data_path.join("postgres.log");
|
let pg_log_path = pg_data_path.join("postgres.log");
|
||||||
|
|
||||||
@@ -245,7 +248,7 @@ impl AttachmentService {
|
|||||||
.await?;
|
.await?;
|
||||||
};
|
};
|
||||||
|
|
||||||
println!("Starting attachment service database...");
|
println!("Starting storage controller database...");
|
||||||
let db_start_args = [
|
let db_start_args = [
|
||||||
"-w",
|
"-w",
|
||||||
"-D",
|
"-D",
|
||||||
@@ -256,7 +259,7 @@ impl AttachmentService {
|
|||||||
];
|
];
|
||||||
|
|
||||||
background_process::start_process(
|
background_process::start_process(
|
||||||
"attachment_service_db",
|
"storage_controller_db",
|
||||||
&self.env.base_data_dir,
|
&self.env.base_data_dir,
|
||||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||||
db_start_args,
|
db_start_args,
|
||||||
@@ -269,13 +272,18 @@ impl AttachmentService {
|
|||||||
// Run migrations on every startup, in case something changed.
|
// Run migrations on every startup, in case something changed.
|
||||||
let database_url = self.setup_database().await?;
|
let database_url = self.setup_database().await?;
|
||||||
|
|
||||||
|
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
|
||||||
|
|
||||||
let mut args = vec![
|
let mut args = vec![
|
||||||
"-l",
|
"-l",
|
||||||
&self.listen,
|
&self.listen,
|
||||||
"-p",
|
"-p",
|
||||||
self.path.as_ref(),
|
self.path.as_ref(),
|
||||||
|
"--dev",
|
||||||
"--database-url",
|
"--database-url",
|
||||||
&database_url,
|
&database_url,
|
||||||
|
"--max-unavailable-interval",
|
||||||
|
&max_unavailable.to_string(),
|
||||||
]
|
]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|s| s.to_string())
|
.map(|s| s.to_string())
|
||||||
@@ -300,7 +308,7 @@ impl AttachmentService {
|
|||||||
background_process::start_process(
|
background_process::start_process(
|
||||||
COMMAND,
|
COMMAND,
|
||||||
&self.env.base_data_dir,
|
&self.env.base_data_dir,
|
||||||
&self.env.attachment_service_bin(),
|
&self.env.storage_controller_bin(),
|
||||||
args,
|
args,
|
||||||
[(
|
[(
|
||||||
"NEON_REPO_DIR".to_string(),
|
"NEON_REPO_DIR".to_string(),
|
||||||
@@ -322,10 +330,10 @@ impl AttachmentService {
|
|||||||
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||||
|
|
||||||
let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
|
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||||
|
|
||||||
println!("Stopping attachment service database...");
|
println!("Stopping storage controller database...");
|
||||||
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
|
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
|
||||||
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
|
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||||
.args(pg_stop_args)
|
.args(pg_stop_args)
|
||||||
@@ -344,10 +352,10 @@ impl AttachmentService {
|
|||||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||||
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||||
println!("Attachment service data base is already stopped");
|
println!("Storage controller database is already stopped");
|
||||||
return Ok(());
|
return Ok(());
|
||||||
} else {
|
} else {
|
||||||
anyhow::bail!("Failed to stop attachment service database: {stop_status}")
|
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -368,7 +376,7 @@ impl AttachmentService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Simple HTTP request wrapper for calling into attachment service
|
/// Simple HTTP request wrapper for calling into storage controller
|
||||||
async fn dispatch<RQ, RS>(
|
async fn dispatch<RQ, RS>(
|
||||||
&self,
|
&self,
|
||||||
method: hyper::Method,
|
method: hyper::Method,
|
||||||
@@ -468,7 +476,7 @@ impl AttachmentService {
|
|||||||
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
||||||
self.dispatch::<(), _>(
|
self.dispatch::<(), _>(
|
||||||
Method::GET,
|
Method::GET,
|
||||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
format!("debug/v1/tenant/{tenant_id}/locate"),
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
@@ -496,11 +504,15 @@ impl AttachmentService {
|
|||||||
&self,
|
&self,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
new_shard_count: u8,
|
new_shard_count: u8,
|
||||||
|
new_stripe_size: Option<ShardStripeSize>,
|
||||||
) -> anyhow::Result<TenantShardSplitResponse> {
|
) -> anyhow::Result<TenantShardSplitResponse> {
|
||||||
self.dispatch(
|
self.dispatch(
|
||||||
Method::PUT,
|
Method::PUT,
|
||||||
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
||||||
Some(TenantShardSplitRequest { new_shard_count }),
|
Some(TenantShardSplitRequest {
|
||||||
|
new_shard_count,
|
||||||
|
new_stripe_size,
|
||||||
|
}),
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
@@ -70,9 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
|
|||||||
Should only be used e.g. for status check.
|
Should only be used e.g. for status check.
|
||||||
Currently also used for connection from any pageserver to any safekeeper.
|
Currently also used for connection from any pageserver to any safekeeper.
|
||||||
|
|
||||||
"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
|
"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane.
|
||||||
|
|
||||||
"admin": Provides access to the control plane and admin APIs of the attachment service.
|
"admin": Provides access to the control plane and admin APIs of the storage controller.
|
||||||
|
|
||||||
### CLI
|
### CLI
|
||||||
CLI generates a key pair during call to `neon_local init` with the following commands:
|
CLI generates a key pair during call to `neon_local init` with the following commands:
|
||||||
|
|||||||
408
docs/rfcs/031-sharding-static.md
Normal file
408
docs/rfcs/031-sharding-static.md
Normal file
@@ -0,0 +1,408 @@
|
|||||||
|
# Sharding Phase 1: Static Key-space Sharding
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
To enable databases with sizes approaching the capacity of a pageserver's disk,
|
||||||
|
it is necessary to break up the storage for the database, or _shard_ it.
|
||||||
|
|
||||||
|
Sharding in general is a complex area. This RFC aims to define an initial
|
||||||
|
capability that will permit creating large-capacity databases using a static configuration
|
||||||
|
defined at time of Tenant creation.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
Currently, all data for a Tenant, including all its timelines, is stored on a single
|
||||||
|
pageserver. The local storage required may be several times larger than the actual
|
||||||
|
database size, due to LSM write inflation.
|
||||||
|
|
||||||
|
If a database is larger than what one pageserver can hold, then it becomes impossible
|
||||||
|
for the pageserver to hold it in local storage, as it must do to provide service to
|
||||||
|
clients.
|
||||||
|
|
||||||
|
### Prior art
|
||||||
|
|
||||||
|
In Neon:
|
||||||
|
|
||||||
|
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
|
||||||
|
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
|
||||||
|
- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
|
||||||
|
|
||||||
|
Prior art in other distributed systems is too broad to capture here: pretty much
|
||||||
|
any scale out storage system does something like this.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Enable creating a large (for example, 16TiB) database without requiring dedicated
|
||||||
|
pageserver nodes.
|
||||||
|
- Share read/write bandwidth costs for large databases across pageservers, as well
|
||||||
|
as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
|
||||||
|
that disrupt service to other tenants.
|
||||||
|
- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
|
||||||
|
does not write out a single contiguous range of page numbers.
|
||||||
|
|
||||||
|
_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
|
||||||
|
that a user might create on a current-gen enterprise SSD should also work well on
|
||||||
|
Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
|
||||||
|
pageserver backend is not the limiting factor in the database size_.
|
||||||
|
|
||||||
|
## Non Goals
|
||||||
|
|
||||||
|
- Independently distributing timelines within the same tenant. If a tenant has many
|
||||||
|
timelines, then sharding may be a less efficient mechanism for distributing load than
|
||||||
|
sharing out timelines between pageservers.
|
||||||
|
- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
|
||||||
|
based on the idea that separate mechanisms will make sense for each dimension.
|
||||||
|
|
||||||
|
## Impacted Components
|
||||||
|
|
||||||
|
pageserver, control plane, postgres/smgr
|
||||||
|
|
||||||
|
## Terminology
|
||||||
|
|
||||||
|
**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
|
||||||
|
the page number is the key in that store. `Key` is a literal data type in existing code.
|
||||||
|
|
||||||
|
**LSN dimension**: this just means the range of LSNs (history), when talking about the range
|
||||||
|
of keys and LSNs as a two dimensional space.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Key sharding vs. LSN sharding
|
||||||
|
|
||||||
|
When we think of sharding across the two dimensional key/lsn space, this is an
|
||||||
|
opportunity to think about how the two dimensions differ:
|
||||||
|
|
||||||
|
- Sharding the key space distributes the _write_ workload of ingesting data
|
||||||
|
and compacting. This work must be carefully managed so that exactly one
|
||||||
|
node owns a given key.
|
||||||
|
- Sharding the LSN space distributes the _historical read_ workload. This work
|
||||||
|
can be done by anyone without any special coordination, as long as they can
|
||||||
|
see the remote index and layers.
|
||||||
|
|
||||||
|
The key sharding is the harder part, and also the more urgent one, to support larger
|
||||||
|
capacity databases. Because distributing historical LSN read work is a relatively
|
||||||
|
simpler problem that most users don't have, we defer it to future work. It is anticipated
|
||||||
|
that some quite simple P2P offload model will enable distributing work for historical
|
||||||
|
reads: a node which is low on space can call out to a peer to ask it to download and
|
||||||
|
serve reads from a historical layer.
|
||||||
|
|
||||||
|
### Key mapping scheme
|
||||||
|
|
||||||
|
Having decided to focus on key sharding, we must next decide how we will map
|
||||||
|
keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
|
||||||
|
between data locality and avoiding entire large relations mapping to the same shard.
|
||||||
|
|
||||||
|
We will define two spaces:
|
||||||
|
|
||||||
|
- Key space: unsigned integer
|
||||||
|
- Shard space: integer from 0 to N-1, where we have N shards.
|
||||||
|
|
||||||
|
### Key -> Shard mapping
|
||||||
|
|
||||||
|
Keys are currently defined in the pageserver's getpage@lsn interface as follows:
|
||||||
|
|
||||||
|
```
|
||||||
|
pub struct Key {
|
||||||
|
pub field1: u8,
|
||||||
|
pub field2: u32,
|
||||||
|
pub field3: u32,
|
||||||
|
pub field4: u32,
|
||||||
|
pub field5: u8,
|
||||||
|
pub field6: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||||
|
Key {
|
||||||
|
field1: 0x00,
|
||||||
|
field2: rel.spcnode,
|
||||||
|
field3: rel.dbnode,
|
||||||
|
field4: rel.relnode,
|
||||||
|
field5: rel.forknum,
|
||||||
|
field6: blknum,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
|
||||||
|
shards. For distribution purposes, we only care about user data keys_
|
||||||
|
|
||||||
|
The properties we want from our Key->Shard mapping are:
|
||||||
|
|
||||||
|
- Locality in `blknum`, such that adjacent `blknum` will usually map to
|
||||||
|
the same stripe and consequently land on the same shard, even though the overall
|
||||||
|
collection of blocks in a relation will be spread over many stripes and therefore
|
||||||
|
many shards.
|
||||||
|
- Avoid the same blknum on different relations landing on the same stripe, so that
|
||||||
|
with many small relations we do not end up aliasing data to the same stripe/shard.
|
||||||
|
- Avoid vulnerability to aliasing in the values of relation identity fields, such that
|
||||||
|
if there are patterns in the value of `relnode`, these do not manifest as patterns
|
||||||
|
in data placement.
|
||||||
|
|
||||||
|
To accomplish this, the blknum is used to select a stripe, and stripes are
|
||||||
|
assigned to shards in a pseudorandom order via a hash. The motivation for
|
||||||
|
pseudo-random distribution (rather than sequential mapping of stripe to shard)
|
||||||
|
is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
|
||||||
|
all relations' stripes to touch pageservers in the same order.
|
||||||
|
|
||||||
|
To map a `Key` to a shard:
|
||||||
|
|
||||||
|
- Hash the `Key` field 4 (relNode).
|
||||||
|
- Divide field 6 (`blknum`) by the stripe size in pages, and combine the
|
||||||
|
hash of this with the hash from the previous step.
|
||||||
|
- The total hash modulo the shard count gives the shard holding this key.
|
||||||
|
|
||||||
|
Why don't we use the other fields in the Key?
|
||||||
|
|
||||||
|
- We ignore `forknum` for key mapping, because it distinguishes different classes of data
|
||||||
|
in the same relation, and we would like to keep the data in a relation together.
|
||||||
|
- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
|
||||||
|
database's blocks differ only by spcNode and dbNode from the original. To enable running
|
||||||
|
this type of creation without cross-pageserver communication, we must ensure that these
|
||||||
|
blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
|
||||||
|
|
||||||
|
### Data placement examples
|
||||||
|
|
||||||
|
For example, consider these extreme cases of postgres data layout for a large database in a system with 8 shards
|
||||||
|
and a stripe size of 32k pages:
|
||||||
|
|
||||||
|
- A single large relation: `blknum` division will break the data up into 4096
|
||||||
|
stripes, which will be scattered across the shards.
|
||||||
|
- 4096 relations of 32k pages each: each relation will map to exactly one stripe,
|
||||||
|
and that stripe will be placed according to the hash of key field 4. The
|
||||||
|
data placement will be statistically uniform across shards.
|
||||||
|
|
||||||
|
Data placement will be more uneven on smaller databases:
|
||||||
|
|
||||||
|
- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
|
||||||
|
that both relations land on the same shard and no data lands on the other shard.
|
||||||
|
- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
|
||||||
|
the data of the other four shards.
|
||||||
|
|
||||||
|
These uneven cases for small amounts of data do not matter, as long as the stripe size
|
||||||
|
is an order of magnitude smaller than the amount of data we are comfortable holding
|
||||||
|
in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
|
||||||
|
a tenant has some shards with 256MB size and some shards with 512MB size, even though
|
||||||
|
the standard deviation of shard size within the tenant is very high. Our key mapping
|
||||||
|
scheme provides a statistical guarantee that as the tenant's overall data size increases,
|
||||||
|
uniformity of placement will improve.
|
||||||
|
|
||||||
|
### Important Types
|
||||||
|
|
||||||
|
#### `ShardIdentity`
|
||||||
|
|
||||||
|
Provides the information needed to know whether a particular key belongs
|
||||||
|
to a particular shard:
|
||||||
|
|
||||||
|
- Layout version
|
||||||
|
- Stripe size
|
||||||
|
- Shard count
|
||||||
|
- Shard index
|
||||||
|
|
||||||
|
This structure's size is constant. Note that if we had used a different key
|
||||||
|
mapping scheme such as consistent hashing with explicit hash ranges assigned
|
||||||
|
to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
|
||||||
|
key mapping scheme used here enables a small fixed size ShardIdentity.
|
||||||
|
|
||||||
|
### Pageserver changes
|
||||||
|
|
||||||
|
#### Structural
|
||||||
|
|
||||||
|
Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
|
||||||
|
`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
|
||||||
|
of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
|
||||||
|
covers the whole keyspace.
|
||||||
|
|
||||||
|
When the pageserver writes layers and index_part.json to remote storage, it must
|
||||||
|
include the shard index & count in the name, to avoid collisions (the count is
|
||||||
|
necessary for future-proofing: the count will vary in time). These keys
|
||||||
|
will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
|
||||||
|
exactly the same for TenantShards as it does for Tenants today: each shard will have
|
||||||
|
its own generation number.
|
||||||
|
|
||||||
|
#### Storage Format: Keys
|
||||||
|
|
||||||
|
For tenants with >1 shard, layer files implicitly become sparse: within the key
|
||||||
|
range described in the layer name, the layer file for a shard will only hold the
|
||||||
|
content relevant to stripes assigned to the shard.
|
||||||
|
|
||||||
|
For this reason, the LayerFileName within a tenant is no longer unique: different shards
|
||||||
|
may use the same LayerFileName to refer to different data. We may solve this simply
|
||||||
|
by including the shard number in the keys used for layers.
|
||||||
|
|
||||||
|
The shard number will be included as a prefix (as part of tenant ID), like this:
|
||||||
|
|
||||||
|
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
|
||||||
|
|
||||||
|
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
|
||||||
|
|
||||||
|
Reasons for this particular format:
|
||||||
|
|
||||||
|
- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
|
||||||
|
we construct a layer file name), and enables efficient listing of index_parts within
|
||||||
|
a particular shard-timeline prefix.
|
||||||
|
- Including the shard _count_ as well as shard number means that in future when we implement
|
||||||
|
shard splitting, it will be possible for a parent shard and one of its children to write
|
||||||
|
the same layer file without a name collision. For example, a parent shard 0_1 might split
|
||||||
|
into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
|
||||||
|
that is distinct from what shard 0_1 would have written at the same place.
|
||||||
|
|
||||||
|
In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
|
||||||
|
and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
|
||||||
|
for example a single-shard tenant's prefix will be `0001`.
|
||||||
|
|
||||||
|
For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
|
||||||
|
and use this as a cue to construct paths with no prefix at all.
|
||||||
|
|
||||||
|
#### Storage Format: Indices
|
||||||
|
|
||||||
|
In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
|
||||||
|
when we implement shard splitting in future, it will be useful to enable shards to reference layers
|
||||||
|
written by other shards (specifically the parent shard during a split), so that shards don't
|
||||||
|
have to exhaustively copy all data into their own shard-prefixed keys.
|
||||||
|
|
||||||
|
To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
|
||||||
|
tuple on each layer, such that it can construct paths for layers written by other shards. This
|
||||||
|
naturally raises the question of who "owns" such layers written by ancestral shards: this problem
|
||||||
|
will be addressed in phase 2.
|
||||||
|
|
||||||
|
For backward compatibility, any index entry without shard information will be assumed to be
|
||||||
|
in the legacy `ShardIdentity`.
|
||||||
|
|
||||||
|
#### WAL Ingest
|
||||||
|
|
||||||
|
In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
|
||||||
|
it down to the pages relevant to their shard:
|
||||||
|
|
||||||
|
- For ordinary user data writes, only retain a write if it matches the ShardIdentity
|
||||||
|
- For metadata describing relations etc, all shards retain these writes.
|
||||||
|
|
||||||
|
The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
|
||||||
|
one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
|
||||||
|
and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
|
||||||
|
expensive: if the safekeeper can be made shard-aware then it could be taught to use
|
||||||
|
the min() of all shards' remote_consistent_lsns to decide when to trim the WAL.
|
||||||
|
|
||||||
|
#### Compaction/GC
|
||||||
|
|
||||||
|
No changes needed.
|
||||||
|
|
||||||
|
The pageserver doesn't have to do anything special during compaction
|
||||||
|
or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
|
||||||
|
This will result in sparse layer files, containing keys only in the stripes that this
|
||||||
|
shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
|
||||||
|
the key range, these should be updated to ignore gaps that are due to sharding, to
|
||||||
|
avoid spuriously splitting up layers into stripe-sized pieces.
|
||||||
|
|
||||||
|
### Compute Endpoints
|
||||||
|
|
||||||
|
Compute endpoints will need to:
|
||||||
|
|
||||||
|
- Accept a vector of connection strings as part of their configuration from the control plane
|
||||||
|
- Route pageserver requests according to mapping the hash of key to the correct
|
||||||
|
entry in the vector of connection strings.
|
||||||
|
|
||||||
|
Doing this in compute rather than routing requests via a single pageserver is
|
||||||
|
necessary to enable sharding tenants without adding latency from extra hops.
|
||||||
|
|
||||||
|
### Control Plane
|
||||||
|
|
||||||
|
Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
|
||||||
|
be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
|
||||||
|
tenants.
|
||||||
|
|
||||||
|
Tenant lifecycle operations like deletion will require fanning-out to all the shards
|
||||||
|
in the tenant. The same goes for timeline creation and deletion: a timeline should
|
||||||
|
not be considered created until it has been created in all shards.
|
||||||
|
|
||||||
|
#### Selectively enabling sharding for large tenants
|
||||||
|
|
||||||
|
Initially, we will explicitly enable sharding for large tenants only.
|
||||||
|
|
||||||
|
In future, this hint mechanism will become optional when we implement automatic
|
||||||
|
re-sharding of tenants.
|
||||||
|
|
||||||
|
## Future Phases
|
||||||
|
|
||||||
|
This section exists to indicate what will likely come next after this phase.
|
||||||
|
|
||||||
|
Phases 2a and 2b are amenable to execution in parallel.
|
||||||
|
|
||||||
|
### Phase 2a: WAL fan-out
|
||||||
|
|
||||||
|
**Problem**: when all shards consume the whole WAL, the network bandwidth used
|
||||||
|
for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
|
||||||
|
of the shard count.
|
||||||
|
|
||||||
|
Network bandwidth is not our most pressing bottleneck, but it is likely to become
|
||||||
|
a problem if we set a modest shard count (~8) on a significant number of tenants,
|
||||||
|
especially as those larger tenants which we shard are also likely to have higher
|
||||||
|
write bandwidth than average.
|
||||||
|
|
||||||
|
### Phase 2b: Shard Splitting
|
||||||
|
|
||||||
|
**Problem**: the number of shards in a tenant is defined at creation time and cannot
|
||||||
|
be changed. This causes excessive sharding for most small tenants, and an upper
|
||||||
|
bound on scale for very large tenants.
|
||||||
|
|
||||||
|
To address this, a _splitting_ feature will later be added. One shard can split its
|
||||||
|
data into a number of children by doing a special compaction operation to generate
|
||||||
|
image layers broken up child-shard-wise, and then writing out an `index_part.json` for
|
||||||
|
each child. This will then require external coordination (by the control plane) to
|
||||||
|
safely attach these new child shards and then move them around to distribute work.
|
||||||
|
The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
|
||||||
|
once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
|
||||||
|
the risk/complexity of implementing such a rarely-encountered scenario.
|
||||||
|
|
||||||
|
### Phase N (future): distributed historical reads
|
||||||
|
|
||||||
|
**Problem**: while sharding based on key is good for handling changes in overall
|
||||||
|
database size, it is less suitable for spiky/unpredictable changes in the read
|
||||||
|
workload to historical layers. Sudden increases in historical reads could result
|
||||||
|
in sudden increases in local disk capacity required for a TenantShard.
|
||||||
|
|
||||||
|
Example: the extreme case of this would be to run a tenant for a year, then create branches
|
||||||
|
with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
|
||||||
|
the on-disk capacity footprint of a TenantShard, since it would be serving reads
|
||||||
|
from all those disparate historical layers.
|
||||||
|
|
||||||
|
If we can respond fast enough, then key-sharding a tenant more finely can help with
|
||||||
|
this, but splitting may be a relatively expensive operation and the increased historical
|
||||||
|
read load may be transient.
|
||||||
|
|
||||||
|
A separate mechanism for handling heavy historical reads could be something like
|
||||||
|
a gossip mechanism for pageservers to communicate
|
||||||
|
about their workload, and then a getpageatlsn offload mechanism where one pageserver can
|
||||||
|
ask another to go read the necessary layers from remote storage to serve the read. This
|
||||||
|
requires relatively little coordination because it is read-only: any node can service any
|
||||||
|
read. All reads to a particular shard would still flow through one node, but the
|
||||||
|
disk capacity & I/O impact of servicing the read would be distributed.
|
||||||
|
|
||||||
|
## FAQ/Alternatives
|
||||||
|
|
||||||
|
### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
|
||||||
|
|
||||||
|
When a database is growing under a write workload, writes may predominantly hit the
|
||||||
|
end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
|
||||||
|
is intensively re-writing a particular relation, if that relation lived in a particular
|
||||||
|
shard then it would not achieve our goal of distributing the write work across shards.
|
||||||
|
|
||||||
|
### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
|
||||||
|
|
||||||
|
1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
|
||||||
|
database would still cause a load hotspot on the pageserver routing its read requests.
|
||||||
|
2. The additional hop through the "proxy" pageserver would add latency and overall
|
||||||
|
resource cost (CPU, network bandwidth)
|
||||||
|
|
||||||
|
### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
|
||||||
|
|
||||||
|
In this model, there would be no explicit sharding of work, but the pageserver to which
|
||||||
|
a tenant is attached would not hold all layers on its disk: instead, it would call out
|
||||||
|
to peers to have them store some layers, and call out to those peers to request reads
|
||||||
|
in those layers.
|
||||||
|
|
||||||
|
This mechanism will work well for distributing work in the LSN dimension, but in the key
|
||||||
|
space dimension it has the major limitation of requiring one node to handle all
|
||||||
|
incoming writes, and compactions. Even if the write workload for a large database
|
||||||
|
fits in one pageserver, it will still be a hotspot and such tenants may still
|
||||||
|
de-facto require their own pageserver.
|
||||||
479
docs/rfcs/032-shard-splitting.md
Normal file
479
docs/rfcs/032-shard-splitting.md
Normal file
@@ -0,0 +1,479 @@
|
|||||||
|
# Shard splitting
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
This RFC describes a new pageserver API for splitting an existing tenant shard into
|
||||||
|
multiple shards, and describes how to use this API to safely increase the total
|
||||||
|
shard count of a tenant.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
|
||||||
|
tenants beyond the capacity of a single pageserver by breaking up the key space
|
||||||
|
into stripes, and distributing these stripes across many pageservers. However,
|
||||||
|
the shard count was defined once at tenant creation time and not varied thereafter.
|
||||||
|
|
||||||
|
In practice, the expected size of a database is rarely known at creation time, and
|
||||||
|
it is inefficient to enable sharding for very small tenants: we need to be
|
||||||
|
able to create a tenant with a small number of shards (such as 1), and later expand
|
||||||
|
when it becomes clear that the tenant has grown in size to a point where sharding
|
||||||
|
is beneficial.
|
||||||
|
|
||||||
|
### Prior art
|
||||||
|
|
||||||
|
Many distributed systems have the problem of choosing how many shards to create for
|
||||||
|
tenants that do not specify an expected size up-front. There are a couple of general
|
||||||
|
approaches:
|
||||||
|
|
||||||
|
- Write to a key space in order, and start a new shard when the highest key advances
|
||||||
|
past some point. This doesn't work well for Neon, because we write to our key space
|
||||||
|
in many different contiguous ranges (per relation), rather than in one contiguous
|
||||||
|
range. To adapt to this kind of model, we would need a sharding scheme where each
|
||||||
|
relation had its own range of shards, which would be inefficient for the common
|
||||||
|
case of databases with many small relations.
|
||||||
|
- Monitor the system, and automatically re-shard at some size threshold. For
|
||||||
|
example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
|
||||||
|
component monitors the size of each RADOS Pool, and adjusts the number of Placement
|
||||||
|
Groups (Ceph's shard equivalent).
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- A configurable capacity limit per-shard is enforced.
|
||||||
|
- Changes in shard count do not interrupt service beyond requiring postgres
|
||||||
|
to reconnect (i.e. milliseconds).
|
||||||
|
- A human being does not have to choose the shard count
|
||||||
|
|
||||||
|
## Non Goals
|
||||||
|
|
||||||
|
- Shard splitting is always a tenant-global operation: we will not enable splitting
|
||||||
|
one shard while leaving others intact.
|
||||||
|
- The inverse operation (shard merging) is not described in this RFC. This is a lower
|
||||||
|
priority than splitting, because databases grow more often than they shrink, and
|
||||||
|
a database with many shards will still work properly if the stored data shrinks, just
|
||||||
|
with slightly more overhead (e.g. redundant WAL replication)
|
||||||
|
- Shard splitting is only initiated based on capacity bounds, not load. Splitting
|
||||||
|
a tenant based on load will make sense for some medium-capacity, high-load workloads,
|
||||||
|
but is more complex to reason about and likely is not desirable until we have
|
||||||
|
shard merging to reduce the shard count again if the database becomes less busy.
|
||||||
|
|
||||||
|
## Impacted Components
|
||||||
|
|
||||||
|
pageserver, storage controller
|
||||||
|
|
||||||
|
(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
|
||||||
|
|
||||||
|
## Terminology
|
||||||
|
|
||||||
|
**Parent** shards are the shards that exist before a split. **Child** shards are
|
||||||
|
the new shards created during a split.
|
||||||
|
|
||||||
|
**Shard** is synonymous with _tenant shard_.
|
||||||
|
|
||||||
|
**Shard Index** is the 2-tuple of shard number and shard count, written in
|
||||||
|
paths as {:02x}{:02x}, e.g. `0001`.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
In the implementation section, a couple of existing aspects of sharding are important
|
||||||
|
to remember:
|
||||||
|
|
||||||
|
- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
|
||||||
|
a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
|
||||||
|
storage paths, and remote index metadata.
|
||||||
|
- Remote layer file paths contain the shard index of the shard that created them, and
|
||||||
|
remote indices contain the same index to enable building the layer file path. A shard's
|
||||||
|
index may reference layers that were created by another shard.
|
||||||
|
- Local tenant shard directories include the shard index. All layers downloaded by
|
||||||
|
a tenant shard are stored in this shard-prefixed path, even if those layers were
|
||||||
|
initially created by another shard: tenant shards do not read and write one another's
|
||||||
|
paths.
|
||||||
|
- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
|
||||||
|
This is for historical reasons and will be cleaned up in future, but the existing
|
||||||
|
name is used here to help comprehension when reading code.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
Note: this section focuses on the correctness of the core split process. This will
|
||||||
|
be fairly inefficient in a naive implementation, and several important optimizations
|
||||||
|
are described in a later section.
|
||||||
|
|
||||||
|
There are broadly two parts to the implementation:
|
||||||
|
|
||||||
|
1. The pageserver split API, which splits one shard on one pageserver
|
||||||
|
2. The overall tenant split process which is coordinated by the storage controller,
|
||||||
|
and calls into the pageserver split API as needed.
|
||||||
|
|
||||||
|
### Pageserver Split API
|
||||||
|
|
||||||
|
The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
|
||||||
|
that takes the new total shard count in the body.
|
||||||
|
|
||||||
|
The pageserver split API operates on one tenant shard, on one pageserver. External
|
||||||
|
coordination is required to use it safely: this is described in the later
|
||||||
|
'Split procedure' section.
|
||||||
|
|
||||||
|
#### Preparation
|
||||||
|
|
||||||
|
First identify the shard indices for the new child shards. These are deterministic,
|
||||||
|
calculated from the parent shard's index, and the number of children being created (this
|
||||||
|
is an input to the API, and validated to be a power of two). In a trivial example, splitting
|
||||||
|
0001 in two always results in 0002 and 0102.
|
||||||
|
|
||||||
|
Child shard indices are chosen such that the children's parts of the keyspace will
|
||||||
|
be subsets of the parent's parts of the keyspace.
|
||||||
|
|
||||||
|
#### Step 1: write new remote indices
|
||||||
|
|
||||||
|
In remote storage, splitting is very simple: we may just write new index_part.json
|
||||||
|
objects for each child shard, containing exactly the same layers as the parent shard.
|
||||||
|
|
||||||
|
The children will have more data than they need, but this avoids any exhaustive
|
||||||
|
re-writing or copying of layer files.
|
||||||
|
|
||||||
|
The index key path includes a generation number: the parent shard's current
|
||||||
|
attached generation number will also be used for the child shards' indices. This
|
||||||
|
makes the operation safely retryable: if everything crashes and restarts, we may
|
||||||
|
call the split API again on the parent shard, and the result will be some new remote
|
||||||
|
indices for the child shards, under a higher generation number.
|
||||||
|
|
||||||
|
#### Step 2: start new `Tenant` objects
|
||||||
|
|
||||||
|
A new `Tenant` object may be instantiated for each child shard, while the parent
|
||||||
|
shard still exists. When calling the tenant_spawn function for this object,
|
||||||
|
the remote index from step 1 will be read, and the child shard will start
|
||||||
|
to ingest WAL to catch up from whatever was in the remote storage at step 1.
|
||||||
|
|
||||||
|
We now wait for child shards' WAL ingestion to catch up with the parent shard,
|
||||||
|
so that we can safely tear down the parent shard without risking an availability
|
||||||
|
gap to clients reading recent LSNs.
|
||||||
|
|
||||||
|
#### Step 3: tear down parent `Tenant` object
|
||||||
|
|
||||||
|
Once child shards are running and have caught up with WAL ingest, we no longer
|
||||||
|
need the parent shard. Note that clients may still be using it -- when we
|
||||||
|
shut it down, any page_service handlers will also shut down, causing clients
|
||||||
|
to disconnect. When the client reconnects, it will re-lookup the tenant,
|
||||||
|
and hit the child shard instead of the parent (shard lookup from page_service
|
||||||
|
should bias toward higher ShardCount shards).
|
||||||
|
|
||||||
|
Note that at this stage the page service client has not yet been notified of
|
||||||
|
any split. In the trivial single split example:
|
||||||
|
|
||||||
|
- Shard 0001 is gone: Tenant object torn down
|
||||||
|
- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
|
||||||
|
- Clients will continue to connect to that server thinking that shard 0001 is there,
|
||||||
|
and all requests will work, because any key that was in shard 0001 is definitely
|
||||||
|
available in either shard 0002 or shard 0102.
|
||||||
|
- Eventually, the storage controller (not the pageserver) will decide to migrate
|
||||||
|
some child shards away: at that point it will do a live migration, ensuring
|
||||||
|
that the client has an updated configuration before it detaches anything
|
||||||
|
from the original server.
|
||||||
|
|
||||||
|
#### Complete
|
||||||
|
|
||||||
|
When we send a 200 response to the split request, we are promising the caller:
|
||||||
|
|
||||||
|
- That the child shards are persistent in remote storage
|
||||||
|
- That the parent shard has been shut down
|
||||||
|
|
||||||
|
This enables the caller to proceed with the overall shard split operation, which
|
||||||
|
may involve other shards on other pageservers.
|
||||||
|
|
||||||
|
### Storage Controller Split procedure
|
||||||
|
|
||||||
|
Splitting a tenant requires calling the pageserver split API, and tracking
|
||||||
|
enough state to ensure recovery + completion in the event of any component (pageserver
|
||||||
|
or storage controller) crashing (or request timing out) during the split.
|
||||||
|
|
||||||
|
1. call the split API on all existing shards. Ensure that the resulting
|
||||||
|
child shards are pinned to their pageservers until _all_ the split calls are done.
|
||||||
|
This pinning may be implemented as a "split bit" on the tenant shards, that
|
||||||
|
blocks any migrations, and also acts as a sign that if we restart, we must go
|
||||||
|
through some recovery steps to resume the split.
|
||||||
|
2. Once all the split calls are done, we may unpin the child shards (clear
|
||||||
|
the split bit). The split is now complete: subsequent steps are just migrations,
|
||||||
|
not strictly part of the split.
|
||||||
|
3. Try to schedule new pageserver locations for the child shards, using
|
||||||
|
a soft anti-affinity constraint to place shards from the same tenant onto different
|
||||||
|
pageservers.
|
||||||
|
|
||||||
|
Updating computes about the new shard count is not necessary until we migrate
|
||||||
|
any of the child shards away from the parent's location.
|
||||||
|
|
||||||
|
### Recovering from failures
|
||||||
|
|
||||||
|
#### Rolling back an incomplete split
|
||||||
|
|
||||||
|
An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
|
||||||
|
and detaching child shards. This will lose any WAL ingested into the children after the parents
|
||||||
|
were detached earlier, but the parents will catch up.
|
||||||
|
|
||||||
|
No special pageserver API is needed for this. From the storage controller's point of view, the
|
||||||
|
procedure is:
|
||||||
|
|
||||||
|
1. For all parent shards in the tenant, ensure they are attached
|
||||||
|
2. For all child shards, ensure they are not attached
|
||||||
|
3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
|
||||||
|
|
||||||
|
Any remote storage content for child shards is left behind. This is similar to other cases where
|
||||||
|
we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
|
||||||
|
index that references it). Future online scrub/cleanup functionality can remove these objects, or
|
||||||
|
they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
|
||||||
|
which would include any child shards that were rolled back.
|
||||||
|
|
||||||
|
If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
|
||||||
|
this, we will **block timeline creation during splitting**, so that we can safely roll back until
|
||||||
|
the split is complete, without risking losing timelines.
|
||||||
|
|
||||||
|
Rolling back an incomplete split will happen automatically if a split fails due to some fatal
|
||||||
|
reason, and will not be accessible via an API:
|
||||||
|
|
||||||
|
- A pageserver fails to complete its split API request after too many retries
|
||||||
|
- A pageserver returns a fatal unexpected error such as 400 or 500
|
||||||
|
- The storage controller database returns a non-retryable error
|
||||||
|
- Some internal invariant is violated in the storage controller split code
|
||||||
|
|
||||||
|
#### Rolling back a complete split
|
||||||
|
|
||||||
|
A complete shard split may be rolled back similarly to an incomplete split, with the following
|
||||||
|
modifications:
|
||||||
|
|
||||||
|
- The parent shards will no longer exist in the storage controller database, so these must
|
||||||
|
be re-synthesized somehow: the hard part of this is figuring out the parent shards' generations. This
|
||||||
|
may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
|
||||||
|
shards in the storage controller database.
|
||||||
|
- Any timelines that were created after the split completed will disappear when rolling back
|
||||||
|
to the parent shards. For this reason, rolling back after a complete split should only
|
||||||
|
be done due to serious issues where loss of recently created timelines is acceptable, or
|
||||||
|
in cases where we have confirmed that no timelines were created in the intervening period.
|
||||||
|
- Parent shards' layers must not have been deleted: this property will come "for free" when
|
||||||
|
we first roll out sharding, by simply not implementing deletion of parent layers after
|
||||||
|
a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
|
||||||
|
Optimizations section), it should apply a TTL to layers such that we have a
|
||||||
|
defined walltime window in which rollback will be possible.
|
||||||
|
|
||||||
|
The storage controller will expose an API for rolling back a complete split, for use
|
||||||
|
in the field if we encounter some critical bug with a post-split tenant.
|
||||||
|
|
||||||
|
#### Retrying API calls during Pageserver Restart
|
||||||
|
|
||||||
|
When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
|
||||||
|
child shards from an ongoing split. This does not intrinsically break anything, and the
|
||||||
|
pageserver may include all these shards in its `/re-attach` request to the storage controller.
|
||||||
|
|
||||||
|
In order to support such restarts, it is important that the storage controller stores
|
||||||
|
persistent records of each child shard before it calls into a pageserver, as these child shards
|
||||||
|
may require generation increments via a `/re-attach` request.
|
||||||
|
|
||||||
|
The pageserver restart will also result in a failed API call from the storage controller's point
|
||||||
|
of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
|
||||||
|
complete, and all shards must remain pinned to their current pageserver locations until the
|
||||||
|
split is done.
|
||||||
|
|
||||||
|
The pageserver API calls during splitting will retry on transient errors, so that
|
||||||
|
short availability gaps do not result in a failure of the overall operation. The
|
||||||
|
split in progress will be automatically rolled back if the threshold for API
|
||||||
|
retries is reached (e.g. if a pageserver stays offline for longer than a typical
|
||||||
|
restart).
|
||||||
|
|
||||||
|
#### Rollback on Storage Controller Restart
|
||||||
|
|
||||||
|
On startup, the storage controller will inspect the split bit for tenant shards that
|
||||||
|
it loads from the database. If any splits are in progress:
|
||||||
|
|
||||||
|
- Database content will be reverted to the parent shards
|
||||||
|
- Child shards will be dropped from memory
|
||||||
|
- The parent and child shards will be included in the general startup reconciliation that
|
||||||
|
the storage controller does: any child shards will be detached from pageservers because
|
||||||
|
they don't exist in the storage controller's expected set of shards, and parent shards
|
||||||
|
will be attached if they aren't already.
|
||||||
|
|
||||||
|
#### Storage controller API request failures/retries
|
||||||
|
|
||||||
|
The split request handler will implement idempotency: if the [`Tenant`] requested to split
|
||||||
|
doesn't exist, we will check for the would-be child shards, and if they already exist,
|
||||||
|
we consider the request complete.
|
||||||
|
|
||||||
|
If a request is retried while the original request is still underway, then the split
|
||||||
|
request handler will notice an InProgress marker in TenantManager, and return 503
|
||||||
|
to encourage the client to backoff/retry. This is the same as the general pageserver
|
||||||
|
API handling for calls that try to act on an InProgress shard.
|
||||||
|
|
||||||
|
#### Compute start/restart during a split
|
||||||
|
|
||||||
|
If a compute starts up during split, it will be configured with the old sharding
|
||||||
|
configuration. This will work for reads irrespective of the progress of the split
|
||||||
|
as long as no child shards have been migrated away from their original location, and
|
||||||
|
this is guaranteed in the split procedure (see earlier section).
|
||||||
|
|
||||||
|
#### Pageserver fails permanently during a split
|
||||||
|
|
||||||
|
If a pageserver permanently fails (i.e. the storage controller availability state for it
|
||||||
|
goes to Offline) while a split is in progress, the splitting operation will roll back, and
|
||||||
|
during the roll back it will skip any API calls to the offline pageserver. If the offline
|
||||||
|
pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
|
||||||
|
|
||||||
|
### Handling secondary locations
|
||||||
|
|
||||||
|
For correctness, it is not necessary to split secondary locations. We can simply detach
|
||||||
|
the secondary locations for parent shards, and then attach new secondary locations
|
||||||
|
for child shards.
|
||||||
|
|
||||||
|
Clearly this is not optimal, as it will result in re-downloads of layer files that
|
||||||
|
were already present on disk. See "Splitting secondary locations" in the Optimizations section.
|
||||||
|
|
||||||
|
### Conditions to trigger a split
|
||||||
|
|
||||||
|
The pageserver will expose a new API for reporting on shards that are candidates
|
||||||
|
for split: this will return a top-N report of the largest tenant shards by
|
||||||
|
physical size (remote size). This should exclude any tenants that are already
|
||||||
|
at the maximum configured shard count.
|
||||||
|
|
||||||
|
The API would look something like:
|
||||||
|
`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
|
||||||
|
|
||||||
|
The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
|
||||||
|
|
||||||
|
A split operation will be started when the tenant exceeds some threshold. This threshold
|
||||||
|
should be _less than_ how large we actually want shards to be, perhaps much less. That's to
|
||||||
|
minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
|
||||||
|
wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
|
||||||
|
tenant size distribution may be useful here: if we can make a statement like "usually, if
|
||||||
|
a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
|
||||||
|
make our policy to split a tenant at 20GiB.
|
||||||
|
|
||||||
|
The finest split we can do is by factors of two, but we can do higher-cardinality splits
|
||||||
|
too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
|
||||||
|
as it grows. An example of a very simple heuristic for early deployment of the splitting
|
||||||
|
feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
|
||||||
|
would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
|
||||||
|
split a tenant, it will not need re-splitting soon after.
|
||||||
|
|
||||||
|
## Optimizations
|
||||||
|
|
||||||
|
### Flush parent shard to remote storage during split
|
||||||
|
|
||||||
|
Any data that is in WAL but not remote storage at time of split will need
|
||||||
|
to be replayed by child shards when they start for the first time. To minimize
|
||||||
|
this work, we may flush the parent shard to remote storage before writing the
|
||||||
|
remote indices for child shards.
|
||||||
|
|
||||||
|
It is important that this flush is subject to some time bounds: we may be splitting
|
||||||
|
in response to a surge of write ingest, so it may be time-critical to split. A
|
||||||
|
few seconds to flush latest data should be sufficient to optimize common cases without
|
||||||
|
running the risk of holding up a split for a harmful length of time when a parent
|
||||||
|
shard is being written heavily. If the flush doesn't complete in time, we may proceed
|
||||||
|
to shut down the parent shard and carry on with the split.
|
||||||
|
|
||||||
|
### Hard linking parent layers into child shard directories
|
||||||
|
|
||||||
|
Before we start the Tenant objects for child shards, we may pre-populate their
|
||||||
|
local storage directories with hard links to the layer files already present
|
||||||
|
in the parent shard's local directory. When the child shard starts and downloads
|
||||||
|
its remote index, it will find all those layer files already present on local disk.
|
||||||
|
|
||||||
|
This avoids wasting download capacity and makes splitting faster, but more importantly
|
||||||
|
it avoids taking up a factor of N more disk space when splitting 1 shard into N.
|
||||||
|
|
||||||
|
This mechanism will work well in typical flows where shards are migrated away
|
||||||
|
promptly after a split, but for the general case including what happens when
|
||||||
|
layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
|
||||||
|
section below.
|
||||||
|
|
||||||
|
### Filtering during compaction
|
||||||
|
|
||||||
|
Compaction, especially image layer generation, should skip any keys that are
|
||||||
|
present in a shard's layer files, but do not match the shard's ShardIdentity's
|
||||||
|
is_key_local() check. This avoids carrying around data for longer than necessary
|
||||||
|
in post-split compactions.
|
||||||
|
|
||||||
|
This was already implemented in https://github.com/neondatabase/neon/pull/6246
|
||||||
|
|
||||||
|
### Proactive compaction
|
||||||
|
|
||||||
|
In remote storage, there is little reason to rewrite any data on a shard split:
|
||||||
|
all the children can reference parent layers via the very cheap write of the child
|
||||||
|
index_part.json.
|
||||||
|
|
||||||
|
In local storage, things are more nuanced. During the initial split there is no
|
||||||
|
capacity cost to duplicating parent layers, if we implement the hard linking
|
||||||
|
optimization described above. However, as soon as any layers are evicted from
|
||||||
|
local disk and re-downloaded, the downloaded layers will not be hard-links any more:
|
||||||
|
they'll have real capacity footprint. That isn't a problem if we migrate child shards
|
||||||
|
away from the parent node swiftly, but it risks a significant over-use of local disk
|
||||||
|
space if we do not.
|
||||||
|
|
||||||
|
For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
|
||||||
|
the shards elsewhere, then churned all the layers in all the shards via eviction,
|
||||||
|
then we would blow up the storage capacity used on the node by 8x. If we're splitting
|
||||||
|
a 100GB shard, that could take the pageserver to the point of exhausting disk space.
|
||||||
|
|
||||||
|
To avoid this scenario, we could implement a special compaction mode where we just
|
||||||
|
read historic layers, drop unwanted keys, and write back the layer file. This
|
||||||
|
is pretty expensive, but useful if we have split a large shard and are not going to
|
||||||
|
migrate the child shards away.
|
||||||
|
|
||||||
|
The heuristic conditions for triggering such a compaction are:
|
||||||
|
|
||||||
|
- A) eviction plus time: if a child shard
|
||||||
|
has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
|
||||||
|
- B) resident size plus time: we may inspect the resident layers and calculate how
|
||||||
|
many of them include the overhead of storing pre-split keys. After some time
|
||||||
|
threshold (different to the one in case A), if we still have such layers occupying
|
||||||
|
local disk space, then we should proactively compact them.
|
||||||
|
|
||||||
|
### Cleaning up parent-shard layers
|
||||||
|
|
||||||
|
It is functionally harmless to leave parent shard layers in remote storage indefinitely.
|
||||||
|
They would be cleaned up in the event of the tenant's deletion.
|
||||||
|
|
||||||
|
As an optimization to avoid leaking remote storage capacity (which costs money), we may
|
||||||
|
lazily clean up parent shard layers once no child shards reference them.
|
||||||
|
|
||||||
|
This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
|
||||||
|
|
||||||
|
- list all the key prefixes beginning with the tenant ID, and select those shard prefixes
|
||||||
|
which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_)
|
||||||
|
- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
|
||||||
|
may drop out now.
|
||||||
|
- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
|
||||||
|
- for all ancestral shards, list objects in the prefix and delete any layer which was not
|
||||||
|
referenced by a current shard.
|
||||||
|
|
||||||
|
If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
|
||||||
|
|
||||||
|
The cleanup may be done by the scrubber (external process), or we may choose to have
|
||||||
|
the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
|
||||||
|
reading the other shards' indices at runtime, and we do not require visibility of the
|
||||||
|
latest index writes.
|
||||||
|
|
||||||
|
Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
|
||||||
|
that we retain the option to roll back a split in case of bugs.
|
||||||
|
|
||||||
|
### Splitting secondary locations
|
||||||
|
|
||||||
|
We may implement a pageserver API similar to the main splitting API, which does a simpler
|
||||||
|
operation for secondary locations: it would not write anything to S3, instead it would simply
|
||||||
|
create the child shard directory on local disk, hard link in directories from the parent,
|
||||||
|
and set up the in-memory (TenantSlot) state for the children.
|
||||||
|
|
||||||
|
Similar to attached locations, a subset of secondary locations will probably need re-locating
|
||||||
|
after the split is complete, to avoid leaving multiple child shards on the same pageservers,
|
||||||
|
where they may use excessive space for the tenant.
|
||||||
|
|
||||||
|
## FAQ/Alternatives
|
||||||
|
|
||||||
|
### What should the thresholds be set to?
|
||||||
|
|
||||||
|
Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
|
||||||
|
|
||||||
|
Max shard count:
|
||||||
|
|
||||||
|
- The safekeeper overhead to sharding is currently O(N) network bandwidth because
|
||||||
|
the un-filtered WAL is sent to all shards. To avoid this growing out of control,
|
||||||
|
a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
|
||||||
|
on the safekeeper.
|
||||||
|
- there is also little benefit to increasing the shard count beyond the number
|
||||||
|
of pageservers in a region.
|
||||||
|
|
||||||
|
### Is it worth just rewriting all the data during a split to simplify reasoning about space?
|
||||||
@@ -40,7 +40,7 @@ macro_rules! register_hll {
|
|||||||
}};
|
}};
|
||||||
|
|
||||||
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
|
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
|
||||||
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
|
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
|
||||||
}};
|
}};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -29,7 +29,6 @@ pub mod launch_timestamp;
|
|||||||
mod wrappers;
|
mod wrappers;
|
||||||
pub use wrappers::{CountedReader, CountedWriter};
|
pub use wrappers::{CountedReader, CountedWriter};
|
||||||
mod hll;
|
mod hll;
|
||||||
pub mod metric_vec_duration;
|
|
||||||
pub use hll::{HyperLogLog, HyperLogLogVec};
|
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
pub mod more_process_metrics;
|
pub mod more_process_metrics;
|
||||||
|
|||||||
@@ -1,23 +0,0 @@
|
|||||||
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
|
|
||||||
|
|
||||||
use std::{future::Future, time::Instant};
|
|
||||||
|
|
||||||
pub trait DurationResultObserver {
|
|
||||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn observe_async_block_duration_by_result<
|
|
||||||
T,
|
|
||||||
E,
|
|
||||||
F: Future<Output = Result<T, E>>,
|
|
||||||
O: DurationResultObserver,
|
|
||||||
>(
|
|
||||||
observer: &O,
|
|
||||||
block: F,
|
|
||||||
) -> Result<T, E> {
|
|
||||||
let start = Instant::now();
|
|
||||||
let result = block.await;
|
|
||||||
let duration = start.elapsed();
|
|
||||||
observer.observe_result(&result, duration);
|
|
||||||
result
|
|
||||||
}
|
|
||||||
@@ -6,7 +6,10 @@ use std::str::FromStr;
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::NodeId;
|
use utils::id::NodeId;
|
||||||
|
|
||||||
use crate::{models::ShardParameters, shard::TenantShardId};
|
use crate::{
|
||||||
|
models::{ShardParameters, TenantConfig},
|
||||||
|
shard::{ShardStripeSize, TenantShardId},
|
||||||
|
};
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct TenantCreateResponseShard {
|
pub struct TenantCreateResponseShard {
|
||||||
@@ -35,7 +38,7 @@ pub struct NodeRegisterRequest {
|
|||||||
pub struct NodeConfigureRequest {
|
pub struct NodeConfigureRequest {
|
||||||
pub node_id: NodeId,
|
pub node_id: NodeId,
|
||||||
|
|
||||||
pub availability: Option<NodeAvailability>,
|
pub availability: Option<NodeAvailabilityWrapper>,
|
||||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -57,6 +60,31 @@ pub struct TenantLocateResponse {
|
|||||||
pub shard_params: ShardParameters,
|
pub shard_params: ShardParameters,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct TenantDescribeResponse {
|
||||||
|
pub shards: Vec<TenantDescribeResponseShard>,
|
||||||
|
pub stripe_size: ShardStripeSize,
|
||||||
|
pub policy: PlacementPolicy,
|
||||||
|
pub config: TenantConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct TenantDescribeResponseShard {
|
||||||
|
pub tenant_shard_id: TenantShardId,
|
||||||
|
|
||||||
|
pub node_attached: Option<NodeId>,
|
||||||
|
pub node_secondary: Vec<NodeId>,
|
||||||
|
|
||||||
|
pub last_error: String,
|
||||||
|
|
||||||
|
/// A task is currently running to reconcile this tenant's intent state with the state on pageservers
|
||||||
|
pub is_reconciling: bool,
|
||||||
|
/// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
|
||||||
|
pub is_pending_compute_notification: bool,
|
||||||
|
/// A shard split is currently underway
|
||||||
|
pub is_splitting: bool,
|
||||||
|
}
|
||||||
|
|
||||||
/// Explicitly migrating a particular shard is a low level operation
|
/// Explicitly migrating a particular shard is a low level operation
|
||||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||||
@@ -66,30 +94,82 @@ pub struct TenantShardMigrateRequest {
|
|||||||
pub node_id: NodeId,
|
pub node_id: NodeId,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
/// Utilisation score indicating how good a candidate a pageserver
|
||||||
|
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||||
|
/// Lower values are better.
|
||||||
|
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
|
||||||
|
pub struct UtilizationScore(pub u64);
|
||||||
|
|
||||||
|
impl UtilizationScore {
|
||||||
|
pub fn worst() -> Self {
|
||||||
|
UtilizationScore(u64::MAX)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Clone, Copy)]
|
||||||
|
#[serde(into = "NodeAvailabilityWrapper")]
|
||||||
pub enum NodeAvailability {
|
pub enum NodeAvailability {
|
||||||
// Normal, happy state
|
// Normal, happy state
|
||||||
Active,
|
Active(UtilizationScore),
|
||||||
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
||||||
// secondary locations on this node still exist. Newly added nodes are in this
|
// secondary locations on this node still exist. Newly added nodes are in this
|
||||||
// state until we successfully contact them.
|
// state until we successfully contact them.
|
||||||
Offline,
|
Offline,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl PartialEq for NodeAvailability {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
use NodeAvailability::*;
|
||||||
|
matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Eq for NodeAvailability {}
|
||||||
|
|
||||||
|
// This wrapper provides serde functionality and it should only be used to
|
||||||
|
// communicate with external callers which don't know or care about the
|
||||||
|
// utilisation score of the pageserver it is targeting.
|
||||||
|
#[derive(Serialize, Deserialize, Clone)]
|
||||||
|
pub enum NodeAvailabilityWrapper {
|
||||||
|
Active,
|
||||||
|
Offline,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<NodeAvailabilityWrapper> for NodeAvailability {
|
||||||
|
fn from(val: NodeAvailabilityWrapper) -> Self {
|
||||||
|
match val {
|
||||||
|
// Assume the worst utilisation score to begin with. It will later be updated by
|
||||||
|
// the heartbeats.
|
||||||
|
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
|
||||||
|
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
||||||
|
fn from(val: NodeAvailability) -> Self {
|
||||||
|
match val {
|
||||||
|
NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
|
||||||
|
NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl FromStr for NodeAvailability {
|
impl FromStr for NodeAvailability {
|
||||||
type Err = anyhow::Error;
|
type Err = anyhow::Error;
|
||||||
|
|
||||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
match s {
|
match s {
|
||||||
"active" => Ok(Self::Active),
|
// This is used when parsing node configuration requests from neon-local.
|
||||||
|
// Assume the worst possible utilisation score
|
||||||
|
// and let it get updated via the heartbeats.
|
||||||
|
"active" => Ok(Self::Active(UtilizationScore::worst())),
|
||||||
"offline" => Ok(Self::Offline),
|
"offline" => Ok(Self::Offline),
|
||||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
|
||||||
/// type needs to be defined with diesel traits in there.
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||||
pub enum NodeSchedulingPolicy {
|
pub enum NodeSchedulingPolicy {
|
||||||
Active,
|
Active,
|
||||||
@@ -129,11 +209,8 @@ impl From<NodeSchedulingPolicy> for String {
|
|||||||
/// to create secondary locations.
|
/// to create secondary locations.
|
||||||
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
|
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||||
pub enum PlacementPolicy {
|
pub enum PlacementPolicy {
|
||||||
/// Cheapest way to attach a tenant: just one pageserver, no secondary
|
/// Normal live state: one attached pageserver and zero or more secondaries.
|
||||||
Single,
|
Attached(usize),
|
||||||
/// Production-ready way to attach a tenant: one attached pageserver and
|
|
||||||
/// some number of secondaries.
|
|
||||||
Double(usize),
|
|
||||||
/// Create one secondary mode locations. This is useful when onboarding
|
/// Create one secondary mode locations. This is useful when onboarding
|
||||||
/// a tenant, or for an idle tenant that we might want to bring online quickly.
|
/// a tenant, or for an idle tenant that we might want to bring online quickly.
|
||||||
Secondary,
|
Secondary,
|
||||||
@@ -155,14 +232,14 @@ mod test {
|
|||||||
/// Check stability of PlacementPolicy's serialization
|
/// Check stability of PlacementPolicy's serialization
|
||||||
#[test]
|
#[test]
|
||||||
fn placement_policy_encoding() -> anyhow::Result<()> {
|
fn placement_policy_encoding() -> anyhow::Result<()> {
|
||||||
let v = PlacementPolicy::Double(1);
|
let v = PlacementPolicy::Attached(1);
|
||||||
let encoded = serde_json::to_string(&v)?;
|
let encoded = serde_json::to_string(&v)?;
|
||||||
assert_eq!(encoded, "{\"Double\":1}");
|
assert_eq!(encoded, "{\"Attached\":1}");
|
||||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||||
|
|
||||||
let v = PlacementPolicy::Single;
|
let v = PlacementPolicy::Detached;
|
||||||
let encoded = serde_json::to_string(&v)?;
|
let encoded = serde_json::to_string(&v)?;
|
||||||
assert_eq!(encoded, "\"Single\"");
|
assert_eq!(encoded, "\"Detached\"");
|
||||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ pub mod utilization;
|
|||||||
pub use utilization::PageserverUtilization;
|
pub use utilization::PageserverUtilization;
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
|
borrow::Cow,
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
io::{BufRead, Read},
|
io::{BufRead, Read},
|
||||||
num::{NonZeroU64, NonZeroUsize},
|
num::{NonZeroU64, NonZeroUsize},
|
||||||
@@ -198,6 +199,13 @@ pub struct TimelineCreateRequest {
|
|||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct TenantShardSplitRequest {
|
pub struct TenantShardSplitRequest {
|
||||||
pub new_shard_count: u8,
|
pub new_shard_count: u8,
|
||||||
|
|
||||||
|
// A tenant's stripe size is only meaningful the first time their shard count goes
|
||||||
|
// above 1: therefore during a split from 1->N shards, we may modify the stripe size.
|
||||||
|
//
|
||||||
|
// If this is set while the stripe count is being increased from an already >1 value,
|
||||||
|
// then the request will fail with 400.
|
||||||
|
pub new_stripe_size: Option<ShardStripeSize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
@@ -419,7 +427,7 @@ pub struct StatusResponse {
|
|||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
#[serde(deny_unknown_fields)]
|
#[serde(deny_unknown_fields)]
|
||||||
pub struct TenantLocationConfigRequest {
|
pub struct TenantLocationConfigRequest {
|
||||||
pub tenant_id: TenantShardId,
|
pub tenant_id: Option<TenantShardId>,
|
||||||
#[serde(flatten)]
|
#[serde(flatten)]
|
||||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||||
}
|
}
|
||||||
@@ -570,7 +578,7 @@ pub struct TimelineInfo {
|
|||||||
pub walreceiver_status: String,
|
pub walreceiver_status: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct LayerMapInfo {
|
pub struct LayerMapInfo {
|
||||||
pub in_memory_layers: Vec<InMemoryLayerInfo>,
|
pub in_memory_layers: Vec<InMemoryLayerInfo>,
|
||||||
pub historic_layers: Vec<HistoricLayerInfo>,
|
pub historic_layers: Vec<HistoricLayerInfo>,
|
||||||
@@ -588,7 +596,7 @@ pub enum LayerAccessKind {
|
|||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct LayerAccessStatFullDetails {
|
pub struct LayerAccessStatFullDetails {
|
||||||
pub when_millis_since_epoch: u64,
|
pub when_millis_since_epoch: u64,
|
||||||
pub task_kind: &'static str,
|
pub task_kind: Cow<'static, str>,
|
||||||
pub access_kind: LayerAccessKind,
|
pub access_kind: LayerAccessKind,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -647,23 +655,23 @@ impl LayerResidenceEvent {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct LayerAccessStats {
|
pub struct LayerAccessStats {
|
||||||
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
|
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
|
||||||
pub task_kind_access_flag: Vec<&'static str>,
|
pub task_kind_access_flag: Vec<Cow<'static, str>>,
|
||||||
pub first: Option<LayerAccessStatFullDetails>,
|
pub first: Option<LayerAccessStatFullDetails>,
|
||||||
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
|
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
|
||||||
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
#[serde(tag = "kind")]
|
#[serde(tag = "kind")]
|
||||||
pub enum InMemoryLayerInfo {
|
pub enum InMemoryLayerInfo {
|
||||||
Open { lsn_start: Lsn },
|
Open { lsn_start: Lsn },
|
||||||
Frozen { lsn_start: Lsn, lsn_end: Lsn },
|
Frozen { lsn_start: Lsn, lsn_end: Lsn },
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
#[serde(tag = "kind")]
|
#[serde(tag = "kind")]
|
||||||
pub enum HistoricLayerInfo {
|
pub enum HistoricLayerInfo {
|
||||||
Delta {
|
Delta {
|
||||||
@@ -685,6 +693,32 @@ pub enum HistoricLayerInfo {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl HistoricLayerInfo {
|
||||||
|
pub fn layer_file_name(&self) -> &str {
|
||||||
|
match self {
|
||||||
|
HistoricLayerInfo::Delta {
|
||||||
|
layer_file_name, ..
|
||||||
|
} => layer_file_name,
|
||||||
|
HistoricLayerInfo::Image {
|
||||||
|
layer_file_name, ..
|
||||||
|
} => layer_file_name,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn is_remote(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
HistoricLayerInfo::Delta { remote, .. } => *remote,
|
||||||
|
HistoricLayerInfo::Image { remote, .. } => *remote,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn set_remote(&mut self, value: bool) {
|
||||||
|
let field = match self {
|
||||||
|
HistoricLayerInfo::Delta { remote, .. } => remote,
|
||||||
|
HistoricLayerInfo::Image { remote, .. } => remote,
|
||||||
|
};
|
||||||
|
*field = value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
pub struct DownloadRemoteLayersTaskSpawnRequest {
|
pub struct DownloadRemoteLayersTaskSpawnRequest {
|
||||||
pub max_concurrent_downloads: NonZeroUsize,
|
pub max_concurrent_downloads: NonZeroUsize,
|
||||||
@@ -717,6 +751,52 @@ pub struct WalRedoManagerStatus {
|
|||||||
pub pid: Option<u32>,
|
pub pid: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
||||||
|
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
|
||||||
|
/// what's happening.
|
||||||
|
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
||||||
|
pub struct SecondaryProgress {
|
||||||
|
/// The remote storage LastModified time of the heatmap object we last downloaded.
|
||||||
|
#[serde(
|
||||||
|
serialize_with = "opt_ser_rfc3339_millis",
|
||||||
|
deserialize_with = "opt_deser_rfc3339_millis"
|
||||||
|
)]
|
||||||
|
pub heatmap_mtime: Option<SystemTime>,
|
||||||
|
|
||||||
|
/// The number of layers currently on-disk
|
||||||
|
pub layers_downloaded: usize,
|
||||||
|
/// The number of layers in the most recently seen heatmap
|
||||||
|
pub layers_total: usize,
|
||||||
|
|
||||||
|
/// The number of layer bytes currently on-disk
|
||||||
|
pub bytes_downloaded: u64,
|
||||||
|
/// The number of layer bytes in the most recently seen heatmap
|
||||||
|
pub bytes_total: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
|
||||||
|
ts: &Option<SystemTime>,
|
||||||
|
serializer: S,
|
||||||
|
) -> Result<S::Ok, S::Error> {
|
||||||
|
match ts {
|
||||||
|
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
|
||||||
|
None => serializer.serialize_none(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
|
||||||
|
where
|
||||||
|
D: serde::de::Deserializer<'de>,
|
||||||
|
{
|
||||||
|
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
|
||||||
|
match s {
|
||||||
|
None => Ok(None),
|
||||||
|
Some(s) => humantime::parse_rfc3339(&s)
|
||||||
|
.map_err(serde::de::Error::custom)
|
||||||
|
.map(Some),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub mod virtual_file {
|
pub mod virtual_file {
|
||||||
#[derive(
|
#[derive(
|
||||||
Copy,
|
Copy,
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ use std::time::SystemTime;
|
|||||||
///
|
///
|
||||||
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
|
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
|
||||||
/// not handle full u64 values properly.
|
/// not handle full u64 values properly.
|
||||||
#[derive(serde::Serialize, Debug)]
|
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
|
||||||
pub struct PageserverUtilization {
|
pub struct PageserverUtilization {
|
||||||
/// Used disk space
|
/// Used disk space
|
||||||
#[serde(serialize_with = "ser_saturating_u63")]
|
#[serde(serialize_with = "ser_saturating_u63")]
|
||||||
@@ -21,7 +21,10 @@ pub struct PageserverUtilization {
|
|||||||
/// When was this snapshot captured, pageserver local time.
|
/// When was this snapshot captured, pageserver local time.
|
||||||
///
|
///
|
||||||
/// Use millis to give confidence that the value is regenerated often enough.
|
/// Use millis to give confidence that the value is regenerated often enough.
|
||||||
#[serde(serialize_with = "ser_rfc3339_millis")]
|
#[serde(
|
||||||
|
serialize_with = "ser_rfc3339_millis",
|
||||||
|
deserialize_with = "deser_rfc3339_millis"
|
||||||
|
)]
|
||||||
pub captured_at: SystemTime,
|
pub captured_at: SystemTime,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,6 +35,14 @@ fn ser_rfc3339_millis<S: serde::Serializer>(
|
|||||||
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
|
||||||
|
where
|
||||||
|
D: serde::de::Deserializer<'de>,
|
||||||
|
{
|
||||||
|
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
|
||||||
|
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
|
||||||
|
}
|
||||||
|
|
||||||
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
||||||
///
|
///
|
||||||
/// Instead of newtype, use this because a newtype would get require handling deserializing values
|
/// Instead of newtype, use this because a newtype would get require handling deserializing values
|
||||||
|
|||||||
@@ -6,19 +6,36 @@
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::NodeId;
|
use utils::id::NodeId;
|
||||||
|
|
||||||
use crate::shard::TenantShardId;
|
use crate::{
|
||||||
|
controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
||||||
|
/// startup.
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct ReAttachRequest {
|
pub struct ReAttachRequest {
|
||||||
pub node_id: NodeId,
|
pub node_id: NodeId,
|
||||||
|
|
||||||
|
/// Optional inline self-registration: this is useful with the storage controller,
|
||||||
|
/// if the node already has a node_id set.
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||||
|
pub register: Option<NodeRegisterRequest>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
fn default_mode() -> LocationConfigMode {
|
||||||
|
LocationConfigMode::AttachedSingle
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
pub struct ReAttachResponseTenant {
|
pub struct ReAttachResponseTenant {
|
||||||
pub id: TenantShardId,
|
pub id: TenantShardId,
|
||||||
pub gen: u32,
|
/// Mandatory if LocationConfigMode is None or set to an Attached* mode
|
||||||
}
|
pub gen: Option<u32>,
|
||||||
|
|
||||||
|
/// Default value only for backward compat: this field should be set
|
||||||
|
#[serde(default = "default_mode")]
|
||||||
|
pub mode: LocationConfigMode,
|
||||||
|
}
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct ReAttachResponse {
|
pub struct ReAttachResponse {
|
||||||
pub tenants: Vec<ReAttachResponseTenant>,
|
pub tenants: Vec<ReAttachResponseTenant>,
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use anyhow::*;
|
use anyhow::*;
|
||||||
use clap::{value_parser, Arg, ArgMatches, Command};
|
use clap::{value_parser, Arg, ArgMatches, Command};
|
||||||
|
use postgres::Client;
|
||||||
use std::{path::PathBuf, str::FromStr};
|
use std::{path::PathBuf, str::FromStr};
|
||||||
use wal_craft::*;
|
use wal_craft::*;
|
||||||
|
|
||||||
@@ -8,8 +9,8 @@ fn main() -> Result<()> {
|
|||||||
.init();
|
.init();
|
||||||
let arg_matches = cli().get_matches();
|
let arg_matches = cli().get_matches();
|
||||||
|
|
||||||
let wal_craft = |arg_matches: &ArgMatches, client| {
|
let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
|
||||||
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
|
let intermediate_lsns = match arg_matches
|
||||||
.get_one::<String>("type")
|
.get_one::<String>("type")
|
||||||
.map(|s| s.as_str())
|
.map(|s| s.as_str())
|
||||||
.context("'type' is required")?
|
.context("'type' is required")?
|
||||||
@@ -25,6 +26,7 @@ fn main() -> Result<()> {
|
|||||||
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
|
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
|
||||||
a => panic!("Unknown --type argument: {a}"),
|
a => panic!("Unknown --type argument: {a}"),
|
||||||
};
|
};
|
||||||
|
let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
|
||||||
for lsn in intermediate_lsns {
|
for lsn in intermediate_lsns {
|
||||||
println!("intermediate_lsn = {lsn}");
|
println!("intermediate_lsn = {lsn}");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ use postgres::types::PgLsn;
|
|||||||
use postgres::Client;
|
use postgres::Client;
|
||||||
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||||
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
||||||
use std::cmp::Ordering;
|
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
@@ -232,59 +231,52 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow
|
|||||||
pub trait Crafter {
|
pub trait Crafter {
|
||||||
const NAME: &'static str;
|
const NAME: &'static str;
|
||||||
|
|
||||||
/// Generates WAL using the client `client`. Returns a pair of:
|
/// Generates WAL using the client `client`. Returns a vector of some valid
|
||||||
/// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
|
/// "interesting" intermediate LSNs which one may start reading from.
|
||||||
/// May include or exclude Lsn(0) and the end-of-wal.
|
/// test_end_of_wal uses this to check various starting points.
|
||||||
/// * The expected end-of-wal LSN.
|
///
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
|
/// Note that postgres is generally keen about writing some WAL. While we
|
||||||
|
/// try to disable it (autovacuum, big wal_writer_delay, etc) it is always
|
||||||
|
/// possible, e.g. xl_running_xacts are dumped each 15s. So checks about
|
||||||
|
/// stable WAL end would be flaky unless postgres is shut down. For this
|
||||||
|
/// reason returning potential end of WAL here is pointless. Most of the
|
||||||
|
/// time this doesn't happen though, so it is reasonable to create needed
|
||||||
|
/// WAL structure and immediately kill postgres like test_end_of_wal does.
|
||||||
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Wraps some WAL craft function, providing current LSN to it before the
|
||||||
|
/// insertion and flushing WAL afterwards. Also pushes initial LSN to the
|
||||||
|
/// result.
|
||||||
fn craft_internal<C: postgres::GenericClient>(
|
fn craft_internal<C: postgres::GenericClient>(
|
||||||
client: &mut C,
|
client: &mut C,
|
||||||
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
|
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<Vec<PgLsn>>,
|
||||||
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
) -> anyhow::Result<Vec<PgLsn>> {
|
||||||
ensure_server_config(client)?;
|
ensure_server_config(client)?;
|
||||||
|
|
||||||
let initial_lsn = client.pg_current_wal_insert_lsn()?;
|
let initial_lsn = client.pg_current_wal_insert_lsn()?;
|
||||||
info!("LSN initial = {}", initial_lsn);
|
info!("LSN initial = {}", initial_lsn);
|
||||||
|
|
||||||
let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
|
let mut intermediate_lsns = f(client, initial_lsn)?;
|
||||||
let last_lsn = match last_lsn {
|
|
||||||
None => client.pg_current_wal_insert_lsn()?,
|
|
||||||
Some(last_lsn) => {
|
|
||||||
let insert_lsn = client.pg_current_wal_insert_lsn()?;
|
|
||||||
match last_lsn.cmp(&insert_lsn) {
|
|
||||||
Ordering::Less => bail!(
|
|
||||||
"Some records were inserted after the crafted WAL: {} vs {}",
|
|
||||||
last_lsn,
|
|
||||||
insert_lsn
|
|
||||||
),
|
|
||||||
Ordering::Equal => last_lsn,
|
|
||||||
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
if !intermediate_lsns.starts_with(&[initial_lsn]) {
|
if !intermediate_lsns.starts_with(&[initial_lsn]) {
|
||||||
intermediate_lsns.insert(0, initial_lsn);
|
intermediate_lsns.insert(0, initial_lsn);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Some records may be not flushed, e.g. non-transactional logical messages.
|
// Some records may be not flushed, e.g. non-transactional logical messages.
|
||||||
|
//
|
||||||
|
// Note: this is broken if pg_current_wal_insert_lsn is at page boundary
|
||||||
|
// because pg_current_wal_insert_lsn skips page headers.
|
||||||
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
||||||
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
|
Ok(intermediate_lsns)
|
||||||
Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
|
|
||||||
Ordering::Equal => {}
|
|
||||||
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
|
|
||||||
}
|
|
||||||
Ok((intermediate_lsns, last_lsn))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Simple;
|
pub struct Simple;
|
||||||
impl Crafter for Simple {
|
impl Crafter for Simple {
|
||||||
const NAME: &'static str = "simple";
|
const NAME: &'static str = "simple";
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||||
craft_internal(client, |client, _| {
|
craft_internal(client, |client, _| {
|
||||||
client.execute("CREATE table t(x int)", &[])?;
|
client.execute("CREATE table t(x int)", &[])?;
|
||||||
Ok((Vec::new(), None))
|
Ok(Vec::new())
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -292,29 +284,36 @@ impl Crafter for Simple {
|
|||||||
pub struct LastWalRecordXlogSwitch;
|
pub struct LastWalRecordXlogSwitch;
|
||||||
impl Crafter for LastWalRecordXlogSwitch {
|
impl Crafter for LastWalRecordXlogSwitch {
|
||||||
const NAME: &'static str = "last_wal_record_xlog_switch";
|
const NAME: &'static str = "last_wal_record_xlog_switch";
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||||
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
// Do not use craft_internal because here we end up with flush_lsn exactly on
|
||||||
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
||||||
ensure_server_config(client)?;
|
ensure_server_config(client)?;
|
||||||
|
|
||||||
client.execute("CREATE table t(x int)", &[])?;
|
client.execute("CREATE table t(x int)", &[])?;
|
||||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||||
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
// pg_switch_wal returns end of last record of the switched segment,
|
||||||
let next_segment = PgLsn::from(0x0200_0000);
|
// i.e. end of SWITCH itself.
|
||||||
|
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||||
|
let before_xlog_switch_u64 = u64::from(before_xlog_switch);
|
||||||
|
let next_segment = PgLsn::from(
|
||||||
|
before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64)
|
||||||
|
+ WAL_SEGMENT_SIZE as u64,
|
||||||
|
);
|
||||||
ensure!(
|
ensure!(
|
||||||
after_xlog_switch <= next_segment,
|
xlog_switch_record_end <= next_segment,
|
||||||
"XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
|
"XLOG_SWITCH record ended after the expected segment boundary: {} > {}",
|
||||||
after_xlog_switch,
|
xlog_switch_record_end,
|
||||||
next_segment
|
next_segment
|
||||||
);
|
);
|
||||||
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
|
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
|
pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
|
||||||
|
/// Craft xlog SWITCH record ending at page boundary.
|
||||||
impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||||
const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
|
const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||||
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
||||||
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
||||||
ensure_server_config(client)?;
|
ensure_server_config(client)?;
|
||||||
@@ -361,28 +360,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
|||||||
|
|
||||||
// Emit the XLOG_SWITCH
|
// Emit the XLOG_SWITCH
|
||||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||||
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||||
let next_segment = PgLsn::from(0x0200_0000);
|
let next_segment = PgLsn::from(0x0200_0000);
|
||||||
ensure!(
|
ensure!(
|
||||||
after_xlog_switch < next_segment,
|
xlog_switch_record_end < next_segment,
|
||||||
"XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
|
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
|
||||||
after_xlog_switch,
|
xlog_switch_record_end,
|
||||||
next_segment
|
next_segment
|
||||||
);
|
);
|
||||||
ensure!(
|
ensure!(
|
||||||
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
|
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
|
||||||
after_xlog_switch,
|
xlog_switch_record_end,
|
||||||
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
|
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||||
);
|
);
|
||||||
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
|
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn craft_single_logical_message(
|
/// Write a ~16MB logical message; it should cross a WAL segment boundary.
|
||||||
|
fn craft_seg_size_logical_message(
|
||||||
client: &mut impl postgres::GenericClient,
|
client: &mut impl postgres::GenericClient,
|
||||||
transactional: bool,
|
transactional: bool,
|
||||||
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
) -> anyhow::Result<Vec<PgLsn>> {
|
||||||
craft_internal(client, |client, initial_lsn| {
|
craft_internal(client, |client, initial_lsn| {
|
||||||
ensure!(
|
ensure!(
|
||||||
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
|
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
|
||||||
@@ -405,34 +405,24 @@ fn craft_single_logical_message(
|
|||||||
"Logical message crossed two segments"
|
"Logical message crossed two segments"
|
||||||
);
|
);
|
||||||
|
|
||||||
if transactional {
|
Ok(vec![message_lsn])
|
||||||
// Transactional logical messages are part of a transaction, so the one above is
|
|
||||||
// followed by a small COMMIT record.
|
|
||||||
|
|
||||||
let after_message_lsn = client.pg_current_wal_insert_lsn()?;
|
|
||||||
ensure!(
|
|
||||||
message_lsn < after_message_lsn,
|
|
||||||
"No record found after the emitted message"
|
|
||||||
);
|
|
||||||
Ok((vec![message_lsn], Some(after_message_lsn)))
|
|
||||||
} else {
|
|
||||||
Ok((Vec::new(), Some(message_lsn)))
|
|
||||||
}
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct WalRecordCrossingSegmentFollowedBySmallOne;
|
pub struct WalRecordCrossingSegmentFollowedBySmallOne;
|
||||||
impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
|
impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
|
||||||
const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
|
const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||||
craft_single_logical_message(client, true)
|
// A transactional message crossing a WAL segment boundary is followed by a small
|
||||||
|
// commit record.
|
||||||
|
craft_seg_size_logical_message(client, true)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct LastWalRecordCrossingSegment;
|
pub struct LastWalRecordCrossingSegment;
|
||||||
impl Crafter for LastWalRecordCrossingSegment {
|
impl Crafter for LastWalRecordCrossingSegment {
|
||||||
const NAME: &'static str = "last_wal_record_crossing_segment";
|
const NAME: &'static str = "last_wal_record_crossing_segment";
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||||
craft_single_logical_message(client, false)
|
craft_seg_size_logical_message(client, false)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,13 +11,15 @@ use utils::const_assert;
|
|||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
fn init_logging() {
|
fn init_logging() {
|
||||||
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
|
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!(
|
||||||
format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
|
"crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"
|
||||||
))
|
)))
|
||||||
.is_test(true)
|
.is_test(true)
|
||||||
.try_init();
|
.try_init();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Test that find_end_of_wal returns the same results as pg_waldump on various
|
||||||
|
/// WALs created by Crafter.
|
||||||
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||||
use crate::*;
|
use crate::*;
|
||||||
|
|
||||||
@@ -38,13 +40,13 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
|||||||
}
|
}
|
||||||
cfg.initdb().unwrap();
|
cfg.initdb().unwrap();
|
||||||
let srv = cfg.start_server().unwrap();
|
let srv = cfg.start_server().unwrap();
|
||||||
let (intermediate_lsns, expected_end_of_wal_partial) =
|
let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
|
||||||
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
|
|
||||||
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
|
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
|
||||||
.iter()
|
.iter()
|
||||||
.map(|&lsn| u64::from(lsn).into())
|
.map(|&lsn| u64::from(lsn).into())
|
||||||
.collect();
|
.collect();
|
||||||
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
|
// Kill postgres. Note that it might have written more WAL records after
|
||||||
|
// 'craft' did its job.
|
||||||
srv.kill();
|
srv.kill();
|
||||||
|
|
||||||
// Check find_end_of_wal on the initial WAL
|
// Check find_end_of_wal on the initial WAL
|
||||||
@@ -56,7 +58,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
|||||||
.filter(|fname| IsXLogFileName(fname))
|
.filter(|fname| IsXLogFileName(fname))
|
||||||
.max()
|
.max()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
|
let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment);
|
||||||
for start_lsn in intermediate_lsns
|
for start_lsn in intermediate_lsns
|
||||||
.iter()
|
.iter()
|
||||||
.chain(std::iter::once(&expected_end_of_wal))
|
.chain(std::iter::once(&expected_end_of_wal))
|
||||||
@@ -91,11 +93,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn check_pg_waldump_end_of_wal(
|
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
|
||||||
cfg: &crate::Conf,
|
|
||||||
last_segment: &str,
|
|
||||||
expected_end_of_wal: Lsn,
|
|
||||||
) {
|
|
||||||
// Get the actual end of WAL by pg_waldump
|
// Get the actual end of WAL by pg_waldump
|
||||||
let waldump_output = cfg
|
let waldump_output = cfg
|
||||||
.pg_waldump("000000010000000000000001", last_segment)
|
.pg_waldump("000000010000000000000001", last_segment)
|
||||||
@@ -113,11 +111,8 @@ fn check_pg_waldump_end_of_wal(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
||||||
info!(
|
info!("waldump erred on {}", waldump_wal_end);
|
||||||
"waldump erred on {}, expected wal end at {}",
|
waldump_wal_end
|
||||||
waldump_wal_end, expected_end_of_wal
|
|
||||||
);
|
|
||||||
assert_eq!(waldump_wal_end, expected_end_of_wal);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn check_end_of_wal(
|
fn check_end_of_wal(
|
||||||
@@ -210,9 +205,9 @@ pub fn test_update_next_xid() {
|
|||||||
#[test]
|
#[test]
|
||||||
pub fn test_encode_logical_message() {
|
pub fn test_encode_logical_message() {
|
||||||
let expected = [
|
let expected = [
|
||||||
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
|
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
|
||||||
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
|
0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
|
||||||
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
|
105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
|
||||||
];
|
];
|
||||||
let actual = encode_logical_message("prefix", "message");
|
let actual = encode_logical_message("prefix", "message");
|
||||||
assert_eq!(expected, actual[..]);
|
assert_eq!(expected, actual[..]);
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ camino.workspace = true
|
|||||||
humantime.workspace = true
|
humantime.workspace = true
|
||||||
hyper = { workspace = true, features = ["stream"] }
|
hyper = { workspace = true, features = ["stream"] }
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
|
rand.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||||
|
|||||||
@@ -157,9 +157,8 @@ impl AzureBlobStorage {
|
|||||||
let mut bufs = Vec::new();
|
let mut bufs = Vec::new();
|
||||||
while let Some(part) = response.next().await {
|
while let Some(part) = response.next().await {
|
||||||
let part = part?;
|
let part = part?;
|
||||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
|
||||||
if etag.is_none() {
|
if etag.is_none() {
|
||||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
etag = Some(part.blob.properties.etag);
|
||||||
}
|
}
|
||||||
if last_modified.is_none() {
|
if last_modified.is_none() {
|
||||||
last_modified = Some(part.blob.properties.last_modified.into());
|
last_modified = Some(part.blob.properties.last_modified.into());
|
||||||
@@ -174,6 +173,16 @@ impl AzureBlobStorage {
|
|||||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||||
bufs.push(data);
|
bufs.push(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if bufs.is_empty() {
|
||||||
|
return Err(DownloadError::Other(anyhow::anyhow!(
|
||||||
|
"Azure GET response contained no buffers"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already
|
||||||
|
let etag = etag.unwrap();
|
||||||
|
let last_modified = last_modified.unwrap();
|
||||||
|
|
||||||
Ok(Download {
|
Ok(Download {
|
||||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||||
etag,
|
etag,
|
||||||
|
|||||||
@@ -42,6 +42,9 @@ pub use self::{
|
|||||||
};
|
};
|
||||||
use s3_bucket::RequestKind;
|
use s3_bucket::RequestKind;
|
||||||
|
|
||||||
|
/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
|
||||||
|
pub use azure_core::Etag;
|
||||||
|
|
||||||
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||||
|
|
||||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||||
@@ -291,9 +294,9 @@ pub type DownloadStream =
|
|||||||
pub struct Download {
|
pub struct Download {
|
||||||
pub download_stream: DownloadStream,
|
pub download_stream: DownloadStream,
|
||||||
/// The last time the file was modified (`last-modified` HTTP header)
|
/// The last time the file was modified (`last-modified` HTTP header)
|
||||||
pub last_modified: Option<SystemTime>,
|
pub last_modified: SystemTime,
|
||||||
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
||||||
pub etag: Option<String>,
|
pub etag: Etag,
|
||||||
/// Extra key-value data, associated with the current remote file.
|
/// Extra key-value data, associated with the current remote file.
|
||||||
pub metadata: Option<StorageMetadata>,
|
pub metadata: Option<StorageMetadata>,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ use std::{
|
|||||||
io::ErrorKind,
|
io::ErrorKind,
|
||||||
num::NonZeroU32,
|
num::NonZeroU32,
|
||||||
pin::Pin,
|
pin::Pin,
|
||||||
time::{Duration, SystemTime},
|
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||||
};
|
};
|
||||||
|
|
||||||
use anyhow::{bail, ensure, Context};
|
use anyhow::{bail, ensure, Context};
|
||||||
@@ -30,6 +30,7 @@ use crate::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use super::{RemoteStorage, StorageMetadata};
|
use super::{RemoteStorage, StorageMetadata};
|
||||||
|
use crate::Etag;
|
||||||
|
|
||||||
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
||||||
|
|
||||||
@@ -197,6 +198,7 @@ impl LocalFs {
|
|||||||
fs::OpenOptions::new()
|
fs::OpenOptions::new()
|
||||||
.write(true)
|
.write(true)
|
||||||
.create(true)
|
.create(true)
|
||||||
|
.truncate(true)
|
||||||
.open(&temp_file_path)
|
.open(&temp_file_path)
|
||||||
.await
|
.await
|
||||||
.with_context(|| {
|
.with_context(|| {
|
||||||
@@ -406,35 +408,37 @@ impl RemoteStorage for LocalFs {
|
|||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<Download, DownloadError> {
|
) -> Result<Download, DownloadError> {
|
||||||
let target_path = from.with_base(&self.storage_root);
|
let target_path = from.with_base(&self.storage_root);
|
||||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
|
||||||
let source = ReaderStream::new(
|
|
||||||
fs::OpenOptions::new()
|
|
||||||
.read(true)
|
|
||||||
.open(&target_path)
|
|
||||||
.await
|
|
||||||
.with_context(|| {
|
|
||||||
format!("Failed to open source file {target_path:?} to use in the download")
|
|
||||||
})
|
|
||||||
.map_err(DownloadError::Other)?,
|
|
||||||
);
|
|
||||||
|
|
||||||
let metadata = self
|
let file_metadata = file_metadata(&target_path).await?;
|
||||||
.read_storage_metadata(&target_path)
|
|
||||||
|
let source = ReaderStream::new(
|
||||||
|
fs::OpenOptions::new()
|
||||||
|
.read(true)
|
||||||
|
.open(&target_path)
|
||||||
.await
|
.await
|
||||||
.map_err(DownloadError::Other)?;
|
.with_context(|| {
|
||||||
|
format!("Failed to open source file {target_path:?} to use in the download")
|
||||||
|
})
|
||||||
|
.map_err(DownloadError::Other)?,
|
||||||
|
);
|
||||||
|
|
||||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
let metadata = self
|
||||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
.read_storage_metadata(&target_path)
|
||||||
|
.await
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
Ok(Download {
|
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||||
metadata,
|
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||||
last_modified: None,
|
|
||||||
etag: None,
|
let etag = mock_etag(&file_metadata);
|
||||||
download_stream: Box::pin(source),
|
Ok(Download {
|
||||||
})
|
metadata,
|
||||||
} else {
|
last_modified: file_metadata
|
||||||
Err(DownloadError::NotFound)
|
.modified()
|
||||||
}
|
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
|
||||||
|
etag,
|
||||||
|
download_stream: Box::pin(source),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download_byte_range(
|
async fn download_byte_range(
|
||||||
@@ -452,50 +456,51 @@ impl RemoteStorage for LocalFs {
|
|||||||
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
|
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let target_path = from.with_base(&self.storage_root);
|
let target_path = from.with_base(&self.storage_root);
|
||||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
let file_metadata = file_metadata(&target_path).await?;
|
||||||
let mut source = tokio::fs::OpenOptions::new()
|
let mut source = tokio::fs::OpenOptions::new()
|
||||||
.read(true)
|
.read(true)
|
||||||
.open(&target_path)
|
.open(&target_path)
|
||||||
.await
|
.await
|
||||||
.with_context(|| {
|
.with_context(|| {
|
||||||
format!("Failed to open source file {target_path:?} to use in the download")
|
format!("Failed to open source file {target_path:?} to use in the download")
|
||||||
})
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
let len = source
|
|
||||||
.metadata()
|
|
||||||
.await
|
|
||||||
.context("query file length")
|
|
||||||
.map_err(DownloadError::Other)?
|
|
||||||
.len();
|
|
||||||
|
|
||||||
source
|
|
||||||
.seek(io::SeekFrom::Start(start_inclusive))
|
|
||||||
.await
|
|
||||||
.context("Failed to seek to the range start in a local storage file")
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
let metadata = self
|
|
||||||
.read_storage_metadata(&target_path)
|
|
||||||
.await
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
|
||||||
let source = ReaderStream::new(source);
|
|
||||||
|
|
||||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
|
||||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
|
||||||
|
|
||||||
Ok(Download {
|
|
||||||
metadata,
|
|
||||||
last_modified: None,
|
|
||||||
etag: None,
|
|
||||||
download_stream: Box::pin(source),
|
|
||||||
})
|
})
|
||||||
} else {
|
.map_err(DownloadError::Other)?;
|
||||||
Err(DownloadError::NotFound)
|
|
||||||
}
|
let len = source
|
||||||
|
.metadata()
|
||||||
|
.await
|
||||||
|
.context("query file length")
|
||||||
|
.map_err(DownloadError::Other)?
|
||||||
|
.len();
|
||||||
|
|
||||||
|
source
|
||||||
|
.seek(io::SeekFrom::Start(start_inclusive))
|
||||||
|
.await
|
||||||
|
.context("Failed to seek to the range start in a local storage file")
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
let metadata = self
|
||||||
|
.read_storage_metadata(&target_path)
|
||||||
|
.await
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||||
|
let source = ReaderStream::new(source);
|
||||||
|
|
||||||
|
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||||
|
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||||
|
|
||||||
|
let etag = mock_etag(&file_metadata);
|
||||||
|
Ok(Download {
|
||||||
|
metadata,
|
||||||
|
last_modified: file_metadata
|
||||||
|
.modified()
|
||||||
|
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
|
||||||
|
etag,
|
||||||
|
download_stream: Box::pin(source),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
|
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||||
@@ -610,13 +615,22 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
|
async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
|
||||||
if file_path.exists() {
|
tokio::fs::metadata(&file_path).await.map_err(|e| {
|
||||||
ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
|
if e.kind() == ErrorKind::NotFound {
|
||||||
Ok(true)
|
DownloadError::NotFound
|
||||||
} else {
|
} else {
|
||||||
Ok(false)
|
DownloadError::BadInput(e.into())
|
||||||
}
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we
|
||||||
|
// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
|
||||||
|
// quickly, with less overhead than using a mock S3 server.
|
||||||
|
fn mock_etag(meta: &std::fs::Metadata) -> Etag {
|
||||||
|
let mtime = meta.modified().expect("Filesystem mtime missing");
|
||||||
|
format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
@@ -35,8 +35,8 @@ use aws_sdk_s3::{
|
|||||||
};
|
};
|
||||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||||
|
|
||||||
use aws_smithy_types::byte_stream::ByteStream;
|
|
||||||
use aws_smithy_types::{body::SdkBody, DateTime};
|
use aws_smithy_types::{body::SdkBody, DateTime};
|
||||||
|
use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use futures::stream::Stream;
|
use futures::stream::Stream;
|
||||||
use hyper::Body;
|
use hyper::Body;
|
||||||
@@ -287,8 +287,17 @@ impl S3Bucket {
|
|||||||
let remaining = self.timeout.saturating_sub(started_at.elapsed());
|
let remaining = self.timeout.saturating_sub(started_at.elapsed());
|
||||||
|
|
||||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||||
let etag = object_output.e_tag;
|
let etag = object_output
|
||||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
.e_tag
|
||||||
|
.ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
|
||||||
|
.into();
|
||||||
|
let last_modified = object_output
|
||||||
|
.last_modified
|
||||||
|
.ok_or(DownloadError::Other(anyhow::anyhow!(
|
||||||
|
"Missing LastModified header"
|
||||||
|
)))?
|
||||||
|
.try_into()
|
||||||
|
.map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
|
||||||
|
|
||||||
let body = object_output.body;
|
let body = object_output.body;
|
||||||
let body = ByteStreamAsStream::from(body);
|
let body = ByteStreamAsStream::from(body);
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ use remote_storage::{
|
|||||||
};
|
};
|
||||||
use test_context::test_context;
|
use test_context::test_context;
|
||||||
use test_context::AsyncTestContext;
|
use test_context::AsyncTestContext;
|
||||||
|
use tokio::io::AsyncBufReadExt;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
@@ -117,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
|||||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||||
{
|
{
|
||||||
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
|
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
|
||||||
let last_modified = dl.last_modified.unwrap();
|
let last_modified = dl.last_modified;
|
||||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||||
let t0_hwt = t0 + half_wt;
|
let t0_hwt = t0 + half_wt;
|
||||||
let t1_hwt = t1 - half_wt;
|
let t1_hwt = t1 - half_wt;
|
||||||
@@ -484,32 +485,33 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
|||||||
))
|
))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||||
|
|
||||||
{
|
{
|
||||||
let mut stream = ctx
|
let stream = ctx
|
||||||
.client
|
.client
|
||||||
.download(&path, &cancel)
|
.download(&path, &cancel)
|
||||||
.await
|
.await
|
||||||
.expect("download succeeds")
|
.expect("download succeeds")
|
||||||
.download_stream;
|
.download_stream;
|
||||||
|
|
||||||
let first = stream
|
let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream));
|
||||||
.next()
|
|
||||||
.await
|
|
||||||
.expect("should have the first blob")
|
|
||||||
.expect("should have succeeded");
|
|
||||||
|
|
||||||
tracing::info!(len = first.len(), "downloaded first chunk");
|
let first = reader.fill_buf().await.expect("should have the first blob");
|
||||||
|
|
||||||
|
let len = first.len();
|
||||||
|
tracing::info!(len, "downloaded first chunk");
|
||||||
|
|
||||||
assert!(
|
assert!(
|
||||||
first.len() < len,
|
first.len() < file_len,
|
||||||
"uploaded file is too small, we downloaded all on first chunk"
|
"uploaded file is too small, we downloaded all on first chunk"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
reader.consume(len);
|
||||||
|
|
||||||
cancel.cancel();
|
cancel.cancel();
|
||||||
|
|
||||||
let next = stream.next().await.expect("stream should have more");
|
let next = reader.fill_buf().await;
|
||||||
|
|
||||||
let e = next.expect_err("expected an error, but got a chunk?");
|
let e = next.expect_err("expected an error, but got a chunk?");
|
||||||
|
|
||||||
@@ -520,6 +522,10 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
|||||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||||
"{inner:?}"
|
"{inner:?}"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let e = DownloadError::from(e);
|
||||||
|
|
||||||
|
assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
let cancel = CancellationToken::new();
|
let cancel = CancellationToken::new();
|
||||||
|
|||||||
@@ -247,7 +247,7 @@ fn scenario_4() {
|
|||||||
//
|
//
|
||||||
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
|
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
|
||||||
//
|
//
|
||||||
// (If we used the the method from the previous scenario, and
|
// (If we used the method from the previous scenario, and
|
||||||
// kept only snapshot at the branch point, we'd need to keep
|
// kept only snapshot at the branch point, we'd need to keep
|
||||||
// all the WAL between 10000-18000 on the main branch, so
|
// all the WAL between 10000-18000 on the main branch, so
|
||||||
// the total size would be 5000 + 1000 + 8000 = 14000. The
|
// the total size would be 5000 + 1000 + 8000 = 14000. The
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ testing = ["fail/failpoints"]
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
arc-swap.workspace = true
|
arc-swap.workspace = true
|
||||||
sentry.workspace = true
|
sentry.workspace = true
|
||||||
|
async-compression.workspace = true
|
||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
bincode.workspace = true
|
bincode.workspace = true
|
||||||
@@ -36,6 +37,7 @@ serde_json.workspace = true
|
|||||||
signal-hook.workspace = true
|
signal-hook.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
|
tokio-tar.workspace = true
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
tracing-error.workspace = true
|
tracing-error.workspace = true
|
||||||
@@ -46,6 +48,7 @@ strum.workspace = true
|
|||||||
strum_macros.workspace = true
|
strum_macros.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
uuid.workspace = true
|
uuid.workspace = true
|
||||||
|
walkdir.workspace = true
|
||||||
|
|
||||||
pq_proto.workspace = true
|
pq_proto.workspace = true
|
||||||
postgres_connection.workspace = true
|
postgres_connection.workspace = true
|
||||||
|
|||||||
@@ -47,9 +47,10 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(serde::Serialize)]
|
#[derive(serde::Serialize, serde::Deserialize)]
|
||||||
struct SerdeRepr<T> {
|
struct SerdeRepr<T> {
|
||||||
buffer: Vec<T>,
|
buffer: Vec<T>,
|
||||||
|
buffer_size: usize,
|
||||||
drop_count: u64,
|
drop_count: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -61,6 +62,7 @@ where
|
|||||||
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
|
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
|
||||||
SerdeRepr {
|
SerdeRepr {
|
||||||
buffer: buffer.iter().cloned().collect(),
|
buffer: buffer.iter().cloned().collect(),
|
||||||
|
buffer_size: L,
|
||||||
drop_count: *drop_count,
|
drop_count: *drop_count,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -78,19 +80,52 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
|
||||||
|
where
|
||||||
|
T: Clone + serde::Deserialize<'de>,
|
||||||
|
{
|
||||||
|
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||||
|
where
|
||||||
|
D: serde::Deserializer<'de>,
|
||||||
|
{
|
||||||
|
let SerdeRepr {
|
||||||
|
buffer: des_buffer,
|
||||||
|
drop_count,
|
||||||
|
buffer_size,
|
||||||
|
} = SerdeRepr::<T>::deserialize(deserializer)?;
|
||||||
|
if buffer_size != L {
|
||||||
|
use serde::de::Error;
|
||||||
|
return Err(D::Error::custom(format!(
|
||||||
|
"invalid buffer_size, expecting {L} got {buffer_size}"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
let mut buffer = HistoryBuffer::new();
|
||||||
|
buffer.extend(des_buffer);
|
||||||
|
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::HistoryBufferWithDropCounter;
|
use super::HistoryBufferWithDropCounter;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_basics() {
|
fn test_basics() {
|
||||||
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
|
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
|
||||||
b.write(1);
|
b.write(1);
|
||||||
b.write(2);
|
b.write(2);
|
||||||
b.write(3);
|
b.write(3);
|
||||||
assert!(b.iter().any(|e| *e == 2));
|
assert!(b.iter().any(|e| *e == 2));
|
||||||
assert!(b.iter().any(|e| *e == 3));
|
assert!(b.iter().any(|e| *e == 3));
|
||||||
assert!(!b.iter().any(|e| *e == 1));
|
assert!(!b.iter().any(|e| *e == 1));
|
||||||
|
|
||||||
|
// round-trip serde
|
||||||
|
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
|
||||||
|
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
round_tripped.iter().cloned().collect::<Vec<_>>(),
|
||||||
|
b.iter().cloned().collect::<Vec<_>>()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
SERVE_METRICS_COUNT.inc();
|
SERVE_METRICS_COUNT.inc();
|
||||||
|
|
||||||
let started_at = std::time::Instant::now();
|
let started_at = std::time::Instant::now();
|
||||||
@@ -367,7 +367,6 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
|||||||
.middleware(Middleware::post_with_info(
|
.middleware(Middleware::post_with_info(
|
||||||
add_request_id_header_to_response,
|
add_request_id_header_to_response,
|
||||||
))
|
))
|
||||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
|
||||||
.err_handler(route_error_handler)
|
.err_handler(route_error_handler)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -87,6 +87,8 @@ pub mod failpoint_support;
|
|||||||
|
|
||||||
pub mod yielding_loop;
|
pub mod yielding_loop;
|
||||||
|
|
||||||
|
pub mod zstd;
|
||||||
|
|
||||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||||
///
|
///
|
||||||
/// we have several cases:
|
/// we have several cases:
|
||||||
|
|||||||
@@ -63,6 +63,7 @@ impl UnwrittenLockFile {
|
|||||||
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
|
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
|
||||||
let lock_file = fs::OpenOptions::new()
|
let lock_file = fs::OpenOptions::new()
|
||||||
.create(true) // O_CREAT
|
.create(true) // O_CREAT
|
||||||
|
.truncate(true)
|
||||||
.write(true)
|
.write(true)
|
||||||
.open(lock_file_path)
|
.open(lock_file_path)
|
||||||
.context("open lock file")?;
|
.context("open lock file")?;
|
||||||
|
|||||||
@@ -29,12 +29,10 @@ pub struct PageserverFeedback {
|
|||||||
// Serialize with RFC3339 format.
|
// Serialize with RFC3339 format.
|
||||||
#[serde(with = "serde_systemtime")]
|
#[serde(with = "serde_systemtime")]
|
||||||
pub replytime: SystemTime,
|
pub replytime: SystemTime,
|
||||||
|
/// Used to track feedbacks from different shards. Always zero for unsharded tenants.
|
||||||
|
pub shard_number: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
|
|
||||||
// Do not remove previously available fields because this might be backwards incompatible.
|
|
||||||
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
|
||||||
|
|
||||||
impl PageserverFeedback {
|
impl PageserverFeedback {
|
||||||
pub fn empty() -> PageserverFeedback {
|
pub fn empty() -> PageserverFeedback {
|
||||||
PageserverFeedback {
|
PageserverFeedback {
|
||||||
@@ -43,6 +41,7 @@ impl PageserverFeedback {
|
|||||||
remote_consistent_lsn: Lsn::INVALID,
|
remote_consistent_lsn: Lsn::INVALID,
|
||||||
disk_consistent_lsn: Lsn::INVALID,
|
disk_consistent_lsn: Lsn::INVALID,
|
||||||
replytime: *PG_EPOCH,
|
replytime: *PG_EPOCH,
|
||||||
|
shard_number: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -59,17 +58,26 @@ impl PageserverFeedback {
|
|||||||
//
|
//
|
||||||
// TODO: change serialized fields names once all computes migrate to rename.
|
// TODO: change serialized fields names once all computes migrate to rename.
|
||||||
pub fn serialize(&self, buf: &mut BytesMut) {
|
pub fn serialize(&self, buf: &mut BytesMut) {
|
||||||
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
|
let buf_ptr = buf.len();
|
||||||
|
buf.put_u8(0); // # of keys, will be filled later
|
||||||
|
let mut nkeys = 0;
|
||||||
|
|
||||||
|
nkeys += 1;
|
||||||
buf.put_slice(b"current_timeline_size\0");
|
buf.put_slice(b"current_timeline_size\0");
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_u64(self.current_timeline_size);
|
buf.put_u64(self.current_timeline_size);
|
||||||
|
|
||||||
|
nkeys += 1;
|
||||||
buf.put_slice(b"ps_writelsn\0");
|
buf.put_slice(b"ps_writelsn\0");
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_u64(self.last_received_lsn.0);
|
buf.put_u64(self.last_received_lsn.0);
|
||||||
|
|
||||||
|
nkeys += 1;
|
||||||
buf.put_slice(b"ps_flushlsn\0");
|
buf.put_slice(b"ps_flushlsn\0");
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_u64(self.disk_consistent_lsn.0);
|
buf.put_u64(self.disk_consistent_lsn.0);
|
||||||
|
|
||||||
|
nkeys += 1;
|
||||||
buf.put_slice(b"ps_applylsn\0");
|
buf.put_slice(b"ps_applylsn\0");
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_u64(self.remote_consistent_lsn.0);
|
buf.put_u64(self.remote_consistent_lsn.0);
|
||||||
@@ -80,9 +88,19 @@ impl PageserverFeedback {
|
|||||||
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
||||||
.as_micros() as i64;
|
.as_micros() as i64;
|
||||||
|
|
||||||
|
nkeys += 1;
|
||||||
buf.put_slice(b"ps_replytime\0");
|
buf.put_slice(b"ps_replytime\0");
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_i64(timestamp);
|
buf.put_i64(timestamp);
|
||||||
|
|
||||||
|
if self.shard_number > 0 {
|
||||||
|
nkeys += 1;
|
||||||
|
buf.put_slice(b"shard_number\0");
|
||||||
|
buf.put_i32(4);
|
||||||
|
buf.put_u32(self.shard_number);
|
||||||
|
}
|
||||||
|
|
||||||
|
buf[buf_ptr] = nkeys;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Deserialize PageserverFeedback message
|
// Deserialize PageserverFeedback message
|
||||||
@@ -123,6 +141,11 @@ impl PageserverFeedback {
|
|||||||
rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
b"shard_number" => {
|
||||||
|
let len = buf.get_i32();
|
||||||
|
assert_eq!(len, 4);
|
||||||
|
rf.shard_number = buf.get_u32();
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
let len = buf.get_i32();
|
let len = buf.get_i32();
|
||||||
warn!(
|
warn!(
|
||||||
@@ -194,10 +217,7 @@ mod tests {
|
|||||||
rf.serialize(&mut data);
|
rf.serialize(&mut data);
|
||||||
|
|
||||||
// Add an extra field to the buffer and adjust number of keys
|
// Add an extra field to the buffer and adjust number of keys
|
||||||
if let Some(first) = data.first_mut() {
|
data[0] += 1;
|
||||||
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
data.put_slice(b"new_field_one\0");
|
data.put_slice(b"new_field_one\0");
|
||||||
data.put_i32(8);
|
data.put_i32(8);
|
||||||
data.put_u64(42);
|
data.put_u64(42);
|
||||||
|
|||||||
@@ -110,6 +110,49 @@ impl<T> OnceCell<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns a guard to an existing initialized value, or returns a unique initialization
|
||||||
|
/// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
|
||||||
|
pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
|
||||||
|
// It looks like OnceCell::get_or_init could be implemented using this method instead of
|
||||||
|
// duplication. However, that makes the future be !Send due to possibly holding on to the
|
||||||
|
// MutexGuard over an await point.
|
||||||
|
loop {
|
||||||
|
let sem = {
|
||||||
|
let guard = self.inner.lock().unwrap();
|
||||||
|
if guard.value.is_some() {
|
||||||
|
return Ok(Guard(guard));
|
||||||
|
}
|
||||||
|
guard.init_semaphore.clone()
|
||||||
|
};
|
||||||
|
|
||||||
|
{
|
||||||
|
let permit = {
|
||||||
|
// increment the count for the duration of queued
|
||||||
|
let _guard = CountWaitingInitializers::start(self);
|
||||||
|
sem.acquire().await
|
||||||
|
};
|
||||||
|
|
||||||
|
let Ok(permit) = permit else {
|
||||||
|
let guard = self.inner.lock().unwrap();
|
||||||
|
if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
|
||||||
|
// there was a take_and_deinit in between
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
assert!(
|
||||||
|
guard.value.is_some(),
|
||||||
|
"semaphore got closed, must be initialized"
|
||||||
|
);
|
||||||
|
return Ok(Guard(guard));
|
||||||
|
};
|
||||||
|
|
||||||
|
permit.forget();
|
||||||
|
}
|
||||||
|
|
||||||
|
let permit = InitPermit(sem);
|
||||||
|
return Err(permit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||||
/// to complete initializing the inner value.
|
/// to complete initializing the inner value.
|
||||||
///
|
///
|
||||||
@@ -202,7 +245,7 @@ impl<'a, T> Guard<'a, T> {
|
|||||||
///
|
///
|
||||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||||
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
|
||||||
let mut swapped = Inner::default();
|
let mut swapped = Inner::default();
|
||||||
let sem = swapped.init_semaphore.clone();
|
let sem = swapped.init_semaphore.clone();
|
||||||
// acquire and forget right away, moving the control over to InitPermit
|
// acquire and forget right away, moving the control over to InitPermit
|
||||||
@@ -481,4 +524,39 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!("t1", *cell.get().unwrap());
|
assert_eq!("t1", *cell.get().unwrap());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test(start_paused = true)]
|
||||||
|
async fn detached_init_smoke() {
|
||||||
|
let target = OnceCell::default();
|
||||||
|
|
||||||
|
let Err(permit) = target.get_or_init_detached().await else {
|
||||||
|
unreachable!("it is not initialized")
|
||||||
|
};
|
||||||
|
|
||||||
|
tokio::time::timeout(
|
||||||
|
std::time::Duration::from_secs(3600 * 24 * 7 * 365),
|
||||||
|
target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect_err("should timeout since we are already holding the permit");
|
||||||
|
|
||||||
|
target.set(42, permit);
|
||||||
|
|
||||||
|
let (_answer, permit) = {
|
||||||
|
let guard = target
|
||||||
|
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert_eq!(*guard, 42);
|
||||||
|
|
||||||
|
guard.take_and_deinit()
|
||||||
|
};
|
||||||
|
|
||||||
|
assert!(target.get().is_none());
|
||||||
|
|
||||||
|
target.set(11, permit);
|
||||||
|
|
||||||
|
assert_eq!(*target.get().unwrap(), 11);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,27 +1,60 @@
|
|||||||
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
|
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||||
|
pub enum VecMapOrdering {
|
||||||
|
Greater,
|
||||||
|
GreaterOrEqual,
|
||||||
|
}
|
||||||
|
|
||||||
/// Ordered map datastructure implemented in a Vec.
|
/// Ordered map datastructure implemented in a Vec.
|
||||||
/// Append only - can only add keys that are larger than the
|
/// Append only - can only add keys that are larger than the
|
||||||
/// current max key.
|
/// current max key.
|
||||||
|
/// Ordering can be adjusted using [`VecMapOrdering`]
|
||||||
|
/// during `VecMap` construction.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct VecMap<K, V>(Vec<(K, V)>);
|
pub struct VecMap<K, V> {
|
||||||
|
data: Vec<(K, V)>,
|
||||||
|
ordering: VecMapOrdering,
|
||||||
|
}
|
||||||
|
|
||||||
impl<K, V> Default for VecMap<K, V> {
|
impl<K, V> Default for VecMap<K, V> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
VecMap(Default::default())
|
VecMap {
|
||||||
|
data: Default::default(),
|
||||||
|
ordering: VecMapOrdering::Greater,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(thiserror::Error, Debug)]
|
||||||
pub struct InvalidKey;
|
pub enum VecMapError {
|
||||||
|
#[error("Key violates ordering constraint")]
|
||||||
|
InvalidKey,
|
||||||
|
#[error("Mismatched ordering constraints")]
|
||||||
|
ExtendOrderingError,
|
||||||
|
}
|
||||||
|
|
||||||
impl<K: Ord, V> VecMap<K, V> {
|
impl<K: Ord, V> VecMap<K, V> {
|
||||||
|
pub fn new(ordering: VecMapOrdering) -> Self {
|
||||||
|
Self {
|
||||||
|
data: Vec::new(),
|
||||||
|
ordering,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
|
||||||
|
Self {
|
||||||
|
data: Vec::with_capacity(capacity),
|
||||||
|
ordering,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn is_empty(&self) -> bool {
|
pub fn is_empty(&self) -> bool {
|
||||||
self.0.is_empty()
|
self.data.is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn as_slice(&self) -> &[(K, V)] {
|
pub fn as_slice(&self) -> &[(K, V)] {
|
||||||
self.0.as_slice()
|
self.data.as_slice()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This function may panic if given a range where the lower bound is
|
/// This function may panic if given a range where the lower bound is
|
||||||
@@ -29,7 +62,7 @@ impl<K: Ord, V> VecMap<K, V> {
|
|||||||
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
|
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
|
||||||
use std::ops::Bound::*;
|
use std::ops::Bound::*;
|
||||||
|
|
||||||
let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
|
let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
|
||||||
|
|
||||||
let start_idx = match range.start_bound() {
|
let start_idx = match range.start_bound() {
|
||||||
Unbounded => 0,
|
Unbounded => 0,
|
||||||
@@ -41,7 +74,7 @@ impl<K: Ord, V> VecMap<K, V> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let end_idx = match range.end_bound() {
|
let end_idx = match range.end_bound() {
|
||||||
Unbounded => self.0.len(),
|
Unbounded => self.data.len(),
|
||||||
Included(k) => match binary_search(k) {
|
Included(k) => match binary_search(k) {
|
||||||
Ok(idx) => idx + 1,
|
Ok(idx) => idx + 1,
|
||||||
Err(idx) => idx,
|
Err(idx) => idx,
|
||||||
@@ -49,34 +82,30 @@ impl<K: Ord, V> VecMap<K, V> {
|
|||||||
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
|
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
|
||||||
};
|
};
|
||||||
|
|
||||||
&self.0[start_idx..end_idx]
|
&self.data[start_idx..end_idx]
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a key value pair to the map.
|
/// Add a key value pair to the map.
|
||||||
/// If `key` is less than or equal to the current maximum key
|
/// If `key` does not respect the ordering of `self`, the
|
||||||
/// the pair will not be added and InvalidKey error will be returned.
|
/// pair will not be added and `InvalidKey` error will be returned.
|
||||||
pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
|
pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
|
||||||
if let Some((last_key, _last_value)) = self.0.last() {
|
self.validate_key_order(&key)?;
|
||||||
if &key <= last_key {
|
|
||||||
return Err(InvalidKey);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
|
let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
|
||||||
Ok(delta_size)
|
Ok(delta_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update the maximum key value pair or add a new key value pair to the map.
|
/// Update the maximum key value pair or add a new key value pair to the map.
|
||||||
/// If `key` is less than the current maximum key no updates or additions
|
/// If `key` does not respect the ordering of `self`, no updates or additions
|
||||||
/// will occur and InvalidKey error will be returned.
|
/// will occur and `InvalidKey` error will be returned.
|
||||||
pub fn append_or_update_last(
|
pub fn append_or_update_last(
|
||||||
&mut self,
|
&mut self,
|
||||||
key: K,
|
key: K,
|
||||||
mut value: V,
|
mut value: V,
|
||||||
) -> Result<(Option<V>, usize), InvalidKey> {
|
) -> Result<(Option<V>, usize), VecMapError> {
|
||||||
if let Some((last_key, last_value)) = self.0.last_mut() {
|
if let Some((last_key, last_value)) = self.data.last_mut() {
|
||||||
match key.cmp(last_key) {
|
match key.cmp(last_key) {
|
||||||
Ordering::Less => return Err(InvalidKey),
|
Ordering::Less => return Err(VecMapError::InvalidKey),
|
||||||
Ordering::Equal => {
|
Ordering::Equal => {
|
||||||
std::mem::swap(last_value, &mut value);
|
std::mem::swap(last_value, &mut value);
|
||||||
const DELTA_SIZE: usize = 0;
|
const DELTA_SIZE: usize = 0;
|
||||||
@@ -100,40 +129,67 @@ impl<K: Ord, V> VecMap<K, V> {
|
|||||||
V: Clone,
|
V: Clone,
|
||||||
{
|
{
|
||||||
let split_idx = self
|
let split_idx = self
|
||||||
.0
|
.data
|
||||||
.binary_search_by_key(&cutoff, extract_key)
|
.binary_search_by_key(&cutoff, extract_key)
|
||||||
.unwrap_or_else(std::convert::identity);
|
.unwrap_or_else(std::convert::identity);
|
||||||
|
|
||||||
(
|
(
|
||||||
VecMap(self.0[..split_idx].to_vec()),
|
VecMap {
|
||||||
VecMap(self.0[split_idx..].to_vec()),
|
data: self.data[..split_idx].to_vec(),
|
||||||
|
ordering: self.ordering,
|
||||||
|
},
|
||||||
|
VecMap {
|
||||||
|
data: self.data[split_idx..].to_vec(),
|
||||||
|
ordering: self.ordering,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Move items from `other` to the end of `self`, leaving `other` empty.
|
/// Move items from `other` to the end of `self`, leaving `other` empty.
|
||||||
/// If any keys in `other` is less than or equal to any key in `self`,
|
/// If the `other` ordering is different from `self` ordering
|
||||||
/// `InvalidKey` error will be returned and no mutation will occur.
|
/// `ExtendOrderingError` error will be returned.
|
||||||
pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
|
/// If any key in `other` does not respect the ordering defined in
|
||||||
let self_last_opt = self.0.last().map(extract_key);
|
/// `self`, `InvalidKey` error will be returned and no mutation will occur.
|
||||||
let other_first_opt = other.0.last().map(extract_key);
|
pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
|
||||||
|
if self.ordering != other.ordering {
|
||||||
|
return Err(VecMapError::ExtendOrderingError);
|
||||||
|
}
|
||||||
|
|
||||||
if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
|
let other_first_opt = other.data.last().map(extract_key);
|
||||||
if self_last >= other_first {
|
if let Some(other_first) = other_first_opt {
|
||||||
return Err(InvalidKey);
|
self.validate_key_order(other_first)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
|
||||||
|
Ok(delta_size)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Validate the current last key in `self` and key being
|
||||||
|
/// inserted against the order defined in `self`.
|
||||||
|
fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
|
||||||
|
if let Some(last_key) = self.data.last().map(extract_key) {
|
||||||
|
match (&self.ordering, &key.cmp(last_key)) {
|
||||||
|
(VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
|
||||||
|
return Err(VecMapError::InvalidKey);
|
||||||
|
}
|
||||||
|
(VecMapOrdering::Greater, Ordering::Greater) => {}
|
||||||
|
(VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
|
||||||
|
return Err(VecMapError::InvalidKey);
|
||||||
|
}
|
||||||
|
(VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
|
Ok(())
|
||||||
Ok(delta_size)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Instrument an operation on the underlying [`Vec`].
|
/// Instrument an operation on the underlying [`Vec`].
|
||||||
/// Will panic if the operation decreases capacity.
|
/// Will panic if the operation decreases capacity.
|
||||||
/// Returns the increase in memory usage caused by the op.
|
/// Returns the increase in memory usage caused by the op.
|
||||||
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
|
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
|
||||||
let old_cap = self.0.capacity();
|
let old_cap = self.data.capacity();
|
||||||
op(&mut self.0);
|
op(&mut self.data);
|
||||||
let new_cap = self.0.capacity();
|
let new_cap = self.data.capacity();
|
||||||
|
|
||||||
match old_cap.cmp(&new_cap) {
|
match old_cap.cmp(&new_cap) {
|
||||||
Ordering::Less => {
|
Ordering::Less => {
|
||||||
@@ -145,6 +201,36 @@ impl<K: Ord, V> VecMap<K, V> {
|
|||||||
Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
|
Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Similar to `from_iter` defined in the `FromIterator` trait except
|
||||||
|
/// that it accepts a [`VecMapOrdering`]
|
||||||
|
pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
|
||||||
|
let iter = iter.into_iter();
|
||||||
|
let initial_capacity = {
|
||||||
|
match iter.size_hint() {
|
||||||
|
(lower_bound, None) => lower_bound,
|
||||||
|
(_, Some(upper_bound)) => upper_bound,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
|
||||||
|
for (key, value) in iter {
|
||||||
|
vec_map
|
||||||
|
.append(key, value)
|
||||||
|
.expect("The passed collection needs to be sorted!");
|
||||||
|
}
|
||||||
|
|
||||||
|
vec_map
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<K: Ord, V> IntoIterator for VecMap<K, V> {
|
||||||
|
type Item = (K, V);
|
||||||
|
type IntoIter = std::vec::IntoIter<(K, V)>;
|
||||||
|
|
||||||
|
fn into_iter(self) -> Self::IntoIter {
|
||||||
|
self.data.into_iter()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
||||||
@@ -155,7 +241,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use std::{collections::BTreeMap, ops::Bound};
|
use std::{collections::BTreeMap, ops::Bound};
|
||||||
|
|
||||||
use super::VecMap;
|
use super::{VecMap, VecMapOrdering};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn unbounded_range() {
|
fn unbounded_range() {
|
||||||
@@ -310,5 +396,59 @@ mod tests {
|
|||||||
left.extend(&mut one_map).unwrap_err();
|
left.extend(&mut one_map).unwrap_err();
|
||||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||||
assert_eq!(one_map.as_slice(), &[(1, ())]);
|
assert_eq!(one_map.as_slice(), &[(1, ())]);
|
||||||
|
|
||||||
|
let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||||
|
map_greater_or_equal.append(2, ()).unwrap();
|
||||||
|
map_greater_or_equal.append(2, ()).unwrap();
|
||||||
|
|
||||||
|
left.extend(&mut map_greater_or_equal).unwrap_err();
|
||||||
|
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||||
|
assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn extend_with_ordering() {
|
||||||
|
let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||||
|
left.append(0, ()).unwrap();
|
||||||
|
assert_eq!(left.as_slice(), &[(0, ())]);
|
||||||
|
|
||||||
|
let mut greater_right = VecMap::new(VecMapOrdering::Greater);
|
||||||
|
greater_right.append(0, ()).unwrap();
|
||||||
|
left.extend(&mut greater_right).unwrap_err();
|
||||||
|
assert_eq!(left.as_slice(), &[(0, ())]);
|
||||||
|
|
||||||
|
let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||||
|
greater_or_equal_right.append(2, ()).unwrap();
|
||||||
|
greater_or_equal_right.append(2, ()).unwrap();
|
||||||
|
left.extend(&mut greater_or_equal_right).unwrap();
|
||||||
|
assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn vec_map_from_sorted() {
|
||||||
|
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
|
||||||
|
let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
|
||||||
|
assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
|
||||||
|
|
||||||
|
let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
|
||||||
|
let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
|
||||||
|
assert_eq!(
|
||||||
|
vec_map.as_slice(),
|
||||||
|
&[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn vec_map_from_unsorted_greater() {
|
||||||
|
let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
|
||||||
|
let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn vec_map_from_unsorted_greater_or_equal() {
|
||||||
|
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
|
||||||
|
let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
78
libs/utils/src/zstd.rs
Normal file
78
libs/utils/src/zstd.rs
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
use std::io::SeekFrom;
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
use async_compression::{
|
||||||
|
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
|
||||||
|
zstd::CParameter,
|
||||||
|
Level,
|
||||||
|
};
|
||||||
|
use camino::Utf8Path;
|
||||||
|
use nix::NixPath;
|
||||||
|
use tokio::{
|
||||||
|
fs::{File, OpenOptions},
|
||||||
|
io::AsyncBufRead,
|
||||||
|
io::AsyncSeekExt,
|
||||||
|
io::AsyncWriteExt,
|
||||||
|
};
|
||||||
|
use tokio_tar::{Archive, Builder, HeaderMode};
|
||||||
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
|
/// Creates a Zstandard tarball.
|
||||||
|
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
|
||||||
|
let file = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.truncate(true)
|
||||||
|
.read(true)
|
||||||
|
.write(true)
|
||||||
|
.open(&tarball)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("tempfile creation {tarball}"))?;
|
||||||
|
|
||||||
|
let mut paths = Vec::new();
|
||||||
|
for entry in WalkDir::new(path) {
|
||||||
|
let entry = entry?;
|
||||||
|
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||||
|
// Also allow directories so that we also get empty directories
|
||||||
|
if !(metadata.is_file() || metadata.is_dir()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let path = entry.into_path();
|
||||||
|
paths.push(path);
|
||||||
|
}
|
||||||
|
// Do a sort to get a more consistent listing
|
||||||
|
paths.sort_unstable();
|
||||||
|
let zstd = ZstdEncoder::with_quality_and_params(
|
||||||
|
file,
|
||||||
|
Level::Default,
|
||||||
|
&[CParameter::enable_long_distance_matching(true)],
|
||||||
|
);
|
||||||
|
let mut builder = Builder::new(zstd);
|
||||||
|
// Use reproducible header mode
|
||||||
|
builder.mode(HeaderMode::Deterministic);
|
||||||
|
for p in paths {
|
||||||
|
let rel_path = p.strip_prefix(path)?;
|
||||||
|
if rel_path.is_empty() {
|
||||||
|
// The top directory should not be compressed,
|
||||||
|
// the tar crate doesn't like that
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
builder.append_path_with_name(&p, rel_path).await?;
|
||||||
|
}
|
||||||
|
let mut zstd = builder.into_inner().await?;
|
||||||
|
zstd.shutdown().await?;
|
||||||
|
let mut compressed = zstd.into_inner();
|
||||||
|
let compressed_len = compressed.metadata().await?.len();
|
||||||
|
compressed.seek(SeekFrom::Start(0)).await?;
|
||||||
|
Ok((compressed, compressed_len))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extracts a Zstandard tarball into `path`.
|
||||||
|
pub async fn extract_zst_tarball(
|
||||||
|
path: &Utf8Path,
|
||||||
|
tarball: impl AsyncBufRead + Unpin,
|
||||||
|
) -> Result<()> {
|
||||||
|
let decoder = Box::pin(ZstdDecoder::new(tarball));
|
||||||
|
let mut archive = Archive::new(decoder);
|
||||||
|
archive.unpack(path).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@@ -69,7 +69,7 @@ pub struct Config {
|
|||||||
/// should be removed once we have a better solution there.
|
/// should be removed once we have a better solution there.
|
||||||
sys_buffer_bytes: u64,
|
sys_buffer_bytes: u64,
|
||||||
|
|
||||||
/// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
|
/// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
|
||||||
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
|
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
|
||||||
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
|
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
|
||||||
/// threshold.
|
/// threshold.
|
||||||
|
|||||||
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
|
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) {
|
||||||
unsafe {
|
unsafe {
|
||||||
let callback_data = (*(*wp).config).callback_data;
|
let callback_data = (*(*wp).config).callback_data;
|
||||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||||
(*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
|
(*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -142,7 +142,7 @@ pub trait ApiImpl {
|
|||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
|
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) {
|
||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -59,6 +59,7 @@ signal-hook.workspace = true
|
|||||||
smallvec = { workspace = true, features = ["write"] }
|
smallvec = { workspace = true, features = ["write"] }
|
||||||
svg_fmt.workspace = true
|
svg_fmt.workspace = true
|
||||||
sync_wrapper.workspace = true
|
sync_wrapper.workspace = true
|
||||||
|
sysinfo.workspace = true
|
||||||
tokio-tar.workspace = true
|
tokio-tar.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||||
@@ -89,6 +90,9 @@ enumset = { workspace = true, features = ["serde"]}
|
|||||||
strum.workspace = true
|
strum.workspace = true
|
||||||
strum_macros.workspace = true
|
strum_macros.workspace = true
|
||||||
|
|
||||||
|
[target.'cfg(target_os = "linux")'.dependencies]
|
||||||
|
procfs.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
criterion.workspace = true
|
criterion.workspace = true
|
||||||
hex-literal.workspace = true
|
hex-literal.workspace = true
|
||||||
|
|||||||
@@ -1,160 +1,156 @@
|
|||||||
//! Simple benchmarking around walredo.
|
//! Quantify a single walredo manager's throughput under N concurrent callers.
|
||||||
//!
|
//!
|
||||||
//! Right now they hope to just set a baseline. Later we can try to expand into latency and
|
//! The benchmark implementation ([`bench_impl`]) is parametrized by
|
||||||
//! throughput after figuring out the coordinated omission problems below.
|
//! - `redo_work` => [`Request::short_input`] or [`Request::medium_input`]
|
||||||
|
//! - `n_redos` => number of times the benchmark shall execute the `redo_work`
|
||||||
|
//! - `nclients` => number of clients (more on this shortly).
|
||||||
//!
|
//!
|
||||||
//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
|
//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
|
||||||
//! logging what happens when a sequential scan is requested on a small table, then picking out two
|
//! It spawns `nclients` times [`client`] tokio tasks.
|
||||||
//! suitable from logs.
|
//! Each task executes the `redo_work` `n_redos/nclients` times.
|
||||||
//!
|
//!
|
||||||
|
//! We exercise the following combinations:
|
||||||
|
//! - `redo_work = short / medium`
|
||||||
|
//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
|
||||||
//!
|
//!
|
||||||
//! Reference data (git blame to see commit) on an i3en.3xlarge
|
//! We let `criterion` determine the `n_redos` using `iter_custom`.
|
||||||
// ```text
|
//! The idea is that for each `(redo_work, nclients)` combination,
|
||||||
//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
|
//! criterion will run the `bench_impl` multiple times with different `n_redos`.
|
||||||
//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
|
//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
|
||||||
//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
|
//! Criterion will divide that by `n_redos` to compute the "time per iteration".
|
||||||
//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
|
//! In our case, "time per iteration" means "time per redo_work execution".
|
||||||
//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
|
//!
|
||||||
//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
|
//! NB: the way by which `iter_custom` determines the "number of iterations"
|
||||||
//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
|
//! is called sampling. Apparently the idea here is to detect outliers.
|
||||||
//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
|
//! We're not sure whether the current choice of sampling method makes sense.
|
||||||
//! ``
|
//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
|
||||||
|
//!
|
||||||
use std::sync::Arc;
|
//! # Reference Numbers
|
||||||
|
//!
|
||||||
|
//! 2024-03-20 on i3en.3xlarge
|
||||||
|
//!
|
||||||
|
//! ```text
|
||||||
|
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
|
||||||
|
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
|
||||||
|
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
|
||||||
|
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
|
||||||
|
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
|
||||||
|
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
|
||||||
|
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
|
||||||
|
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
|
||||||
|
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
|
||||||
|
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
|
||||||
|
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
|
||||||
|
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
|
||||||
|
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
|
||||||
|
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
|
||||||
|
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
|
||||||
|
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
|
||||||
|
//! ```
|
||||||
|
|
||||||
use bytes::{Buf, Bytes};
|
use bytes::{Buf, Bytes};
|
||||||
use pageserver::{
|
use criterion::{BenchmarkId, Criterion};
|
||||||
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
|
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||||
|
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||||
|
use std::{
|
||||||
|
sync::Arc,
|
||||||
|
time::{Duration, Instant},
|
||||||
};
|
};
|
||||||
use pageserver_api::shard::TenantShardId;
|
use tokio::{sync::Barrier, task::JoinSet};
|
||||||
use tokio::task::JoinSet;
|
|
||||||
use utils::{id::TenantId, lsn::Lsn};
|
use utils::{id::TenantId, lsn::Lsn};
|
||||||
|
|
||||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
fn bench(c: &mut Criterion) {
|
||||||
|
{
|
||||||
|
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||||
|
for nclients in nclients {
|
||||||
|
let mut group = c.benchmark_group("short");
|
||||||
|
group.bench_with_input(
|
||||||
|
BenchmarkId::from_parameter(nclients),
|
||||||
|
&nclients,
|
||||||
|
|b, nclients| {
|
||||||
|
let redo_work = Arc::new(Request::short_input());
|
||||||
|
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn redo_scenarios(c: &mut Criterion) {
|
{
|
||||||
// logging should be enabled when adding more inputs, since walredo will only report malformed
|
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||||
// input to the stderr.
|
for nclients in nclients {
|
||||||
// utils::logging::init(utils::logging::LogFormat::Plain).unwrap();
|
let mut group = c.benchmark_group("medium");
|
||||||
|
group.bench_with_input(
|
||||||
|
BenchmarkId::from_parameter(nclients),
|
||||||
|
&nclients,
|
||||||
|
|b, nclients| {
|
||||||
|
let redo_work = Arc::new(Request::medium_input());
|
||||||
|
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
criterion::criterion_group!(benches, bench);
|
||||||
|
criterion::criterion_main!(benches);
|
||||||
|
|
||||||
|
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||||
|
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||||
|
|
||||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||||
let conf = Box::leak(Box::new(conf));
|
let conf = Box::leak(Box::new(conf));
|
||||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||||
|
|
||||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
|
||||||
|
|
||||||
let manager = Arc::new(manager);
|
|
||||||
|
|
||||||
{
|
|
||||||
let rt = tokio::runtime::Builder::new_current_thread()
|
|
||||||
.enable_all()
|
|
||||||
.build()
|
|
||||||
.unwrap();
|
|
||||||
tracing::info!("executing first");
|
|
||||||
rt.block_on(short().execute(&manager)).unwrap();
|
|
||||||
tracing::info!("first executed");
|
|
||||||
}
|
|
||||||
|
|
||||||
let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
|
|
||||||
|
|
||||||
let mut group = c.benchmark_group("short");
|
|
||||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
|
||||||
|
|
||||||
for thread_count in thread_counts {
|
|
||||||
group.bench_with_input(
|
|
||||||
BenchmarkId::new("short", thread_count),
|
|
||||||
&thread_count,
|
|
||||||
|b, thread_count| {
|
|
||||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
|
|
||||||
},
|
|
||||||
);
|
|
||||||
}
|
|
||||||
drop(group);
|
|
||||||
|
|
||||||
let mut group = c.benchmark_group("medium");
|
|
||||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
|
||||||
|
|
||||||
for thread_count in thread_counts {
|
|
||||||
group.bench_with_input(
|
|
||||||
BenchmarkId::new("medium", thread_count),
|
|
||||||
&thread_count,
|
|
||||||
|b, thread_count| {
|
|
||||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
|
|
||||||
},
|
|
||||||
);
|
|
||||||
}
|
|
||||||
drop(group);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Sets up a multi-threaded tokio runtime with default worker thread count,
|
|
||||||
/// then, spawn `requesters` tasks that repeatedly:
|
|
||||||
/// - get input from `input_factor()`
|
|
||||||
/// - call `manager.request_redo()` with their input
|
|
||||||
///
|
|
||||||
/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
|
|
||||||
///
|
|
||||||
/// Using tokio's default worker thread count means the results will differ on machines
|
|
||||||
/// with different core countrs. We don't care about that, the performance will always
|
|
||||||
/// be different on different hardware. To compare performance of different software versions,
|
|
||||||
/// use the same hardware.
|
|
||||||
fn add_multithreaded_walredo_requesters(
|
|
||||||
b: &mut criterion::Bencher,
|
|
||||||
nrequesters: usize,
|
|
||||||
manager: &Arc<PostgresRedoManager>,
|
|
||||||
input_factory: fn() -> Request,
|
|
||||||
) {
|
|
||||||
assert_ne!(nrequesters, 0);
|
|
||||||
|
|
||||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||||
.enable_all()
|
.enable_all()
|
||||||
.build()
|
.build()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
|
let start = Arc::new(Barrier::new(nclients as usize));
|
||||||
|
|
||||||
let mut requesters = JoinSet::new();
|
let mut tasks = JoinSet::new();
|
||||||
for _ in 0..nrequesters {
|
|
||||||
let _entered = rt.enter();
|
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||||
let manager = manager.clone();
|
let manager = Arc::new(manager);
|
||||||
let barrier = barrier.clone();
|
|
||||||
requesters.spawn(async move {
|
for _ in 0..nclients {
|
||||||
loop {
|
rt.block_on(async {
|
||||||
let input = input_factory();
|
tasks.spawn(client(
|
||||||
barrier.wait().await;
|
Arc::clone(&manager),
|
||||||
let page = input.execute(&manager).await.unwrap();
|
Arc::clone(&start),
|
||||||
assert_eq!(page.remaining(), 8192);
|
Arc::clone(&redo_work),
|
||||||
barrier.wait().await;
|
// divide the amount of work equally among the clients
|
||||||
}
|
n_redos / nclients,
|
||||||
|
))
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let do_one_iteration = || {
|
rt.block_on(async move {
|
||||||
rt.block_on(async {
|
let mut total_wallclock_time = std::time::Duration::from_millis(0);
|
||||||
barrier.wait().await;
|
while let Some(res) = tasks.join_next().await {
|
||||||
// wait for work to complete
|
total_wallclock_time += res.unwrap();
|
||||||
barrier.wait().await;
|
}
|
||||||
})
|
total_wallclock_time
|
||||||
};
|
})
|
||||||
|
|
||||||
b.iter_batched(
|
|
||||||
|| {
|
|
||||||
// warmup
|
|
||||||
do_one_iteration();
|
|
||||||
},
|
|
||||||
|()| {
|
|
||||||
// work loop
|
|
||||||
do_one_iteration();
|
|
||||||
},
|
|
||||||
criterion::BatchSize::PerIteration,
|
|
||||||
);
|
|
||||||
|
|
||||||
rt.block_on(requesters.shutdown());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
criterion_group!(benches, redo_scenarios);
|
async fn client(
|
||||||
criterion_main!(benches);
|
mgr: Arc<PostgresRedoManager>,
|
||||||
|
start: Arc<Barrier>,
|
||||||
|
redo_work: Arc<Request>,
|
||||||
|
n_redos: u64,
|
||||||
|
) -> Duration {
|
||||||
|
start.wait().await;
|
||||||
|
let start = Instant::now();
|
||||||
|
for _ in 0..n_redos {
|
||||||
|
let page = redo_work.execute(&mgr).await.unwrap();
|
||||||
|
assert_eq!(page.remaining(), 8192);
|
||||||
|
// The real pageserver will rarely if ever do 2 walredos in a row without
|
||||||
|
// yielding to the executor.
|
||||||
|
tokio::task::yield_now().await;
|
||||||
|
}
|
||||||
|
start.elapsed()
|
||||||
|
}
|
||||||
|
|
||||||
macro_rules! lsn {
|
macro_rules! lsn {
|
||||||
($input:expr) => {{
|
($input:expr) => {{
|
||||||
@@ -166,12 +162,46 @@ macro_rules! lsn {
|
|||||||
}};
|
}};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Short payload, 1132 bytes.
|
/// Simple wrapper around `WalRedoManager::request_redo`.
|
||||||
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
|
///
|
||||||
// for null bytes.
|
/// In benchmarks this is cloned around.
|
||||||
#[allow(clippy::octal_escapes)]
|
#[derive(Clone)]
|
||||||
fn short() -> Request {
|
struct Request {
|
||||||
Request {
|
key: Key,
|
||||||
|
lsn: Lsn,
|
||||||
|
base_img: Option<(Lsn, Bytes)>,
|
||||||
|
records: Vec<(Lsn, NeonWalRecord)>,
|
||||||
|
pg_version: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Request {
|
||||||
|
async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
|
||||||
|
let Request {
|
||||||
|
key,
|
||||||
|
lsn,
|
||||||
|
base_img,
|
||||||
|
records,
|
||||||
|
pg_version,
|
||||||
|
} = self;
|
||||||
|
|
||||||
|
// TODO: avoid these clones
|
||||||
|
manager
|
||||||
|
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
|
||||||
|
let rec = Bytes::from_static(bytes);
|
||||||
|
NeonWalRecord::Postgres { will_init, rec }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Short payload, 1132 bytes.
|
||||||
|
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
|
||||||
|
// for null bytes.
|
||||||
|
#[allow(clippy::octal_escapes)]
|
||||||
|
pub fn short_input() -> Request {
|
||||||
|
let pg_record = Self::pg_record;
|
||||||
|
Request {
|
||||||
key: Key {
|
key: Key {
|
||||||
field1: 0,
|
field1: 0,
|
||||||
field2: 1663,
|
field2: 1663,
|
||||||
@@ -194,13 +224,14 @@ fn short() -> Request {
|
|||||||
],
|
],
|
||||||
pg_version: 14,
|
pg_version: 14,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Medium sized payload, serializes as 26393 bytes.
|
/// Medium sized payload, serializes as 26393 bytes.
|
||||||
// see [`short`]
|
// see [`short`]
|
||||||
#[allow(clippy::octal_escapes)]
|
#[allow(clippy::octal_escapes)]
|
||||||
fn medium() -> Request {
|
pub fn medium_input() -> Request {
|
||||||
Request {
|
let pg_record = Self::pg_record;
|
||||||
|
Request {
|
||||||
key: Key {
|
key: Key {
|
||||||
field1: 0,
|
field1: 0,
|
||||||
field2: 1663,
|
field2: 1663,
|
||||||
@@ -442,37 +473,5 @@ fn medium() -> Request {
|
|||||||
],
|
],
|
||||||
pg_version: 14,
|
pg_version: 14,
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
|
|
||||||
let rec = Bytes::from_static(bytes);
|
|
||||||
NeonWalRecord::Postgres { will_init, rec }
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Simple wrapper around `WalRedoManager::request_redo`.
|
|
||||||
///
|
|
||||||
/// In benchmarks this is cloned around.
|
|
||||||
#[derive(Clone)]
|
|
||||||
struct Request {
|
|
||||||
key: Key,
|
|
||||||
lsn: Lsn,
|
|
||||||
base_img: Option<(Lsn, Bytes)>,
|
|
||||||
records: Vec<(Lsn, NeonWalRecord)>,
|
|
||||||
pg_version: u32,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Request {
|
|
||||||
async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
|
|
||||||
let Request {
|
|
||||||
key,
|
|
||||||
lsn,
|
|
||||||
base_img,
|
|
||||||
records,
|
|
||||||
pg_version,
|
|
||||||
} = self;
|
|
||||||
|
|
||||||
manager
|
|
||||||
.request_redo(key, lsn, base_img, records, pg_version)
|
|
||||||
.await
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -169,7 +169,7 @@ impl Client {
|
|||||||
self.request(Method::GET, uri, ()).await
|
self.request(Method::GET, uri, ()).await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||||
&self,
|
&self,
|
||||||
method: Method,
|
method: Method,
|
||||||
uri: U,
|
uri: U,
|
||||||
@@ -181,7 +181,16 @@ impl Client {
|
|||||||
} else {
|
} else {
|
||||||
req
|
req
|
||||||
};
|
};
|
||||||
let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
|
req.json(&body).send().await.map_err(Error::ReceiveBody)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||||
|
&self,
|
||||||
|
method: Method,
|
||||||
|
uri: U,
|
||||||
|
body: B,
|
||||||
|
) -> Result<reqwest::Response> {
|
||||||
|
let res = self.request_noerror(method, uri, body).await?;
|
||||||
let response = res.error_from_body().await?;
|
let response = res.error_from_body().await?;
|
||||||
Ok(response)
|
Ok(response)
|
||||||
}
|
}
|
||||||
@@ -240,13 +249,26 @@ impl Client {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
|
pub async fn tenant_secondary_download(
|
||||||
let uri = format!(
|
&self,
|
||||||
|
tenant_id: TenantShardId,
|
||||||
|
wait: Option<std::time::Duration>,
|
||||||
|
) -> Result<(StatusCode, SecondaryProgress)> {
|
||||||
|
let mut path = reqwest::Url::parse(&format!(
|
||||||
"{}/v1/tenant/{}/secondary/download",
|
"{}/v1/tenant/{}/secondary/download",
|
||||||
self.mgmt_api_endpoint, tenant_id
|
self.mgmt_api_endpoint, tenant_id
|
||||||
);
|
))
|
||||||
self.request(Method::POST, &uri, ()).await?;
|
.expect("Cannot build URL");
|
||||||
Ok(())
|
|
||||||
|
if let Some(wait) = wait {
|
||||||
|
path.query_pairs_mut()
|
||||||
|
.append_pair("wait_ms", &format!("{}", wait.as_millis()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let response = self.request(Method::POST, path, ()).await?;
|
||||||
|
let status = response.status();
|
||||||
|
let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?;
|
||||||
|
Ok((status, progress))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn location_config(
|
pub async fn location_config(
|
||||||
@@ -257,7 +279,7 @@ impl Client {
|
|||||||
lazy: bool,
|
lazy: bool,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let req_body = TenantLocationConfigRequest {
|
let req_body = TenantLocationConfigRequest {
|
||||||
tenant_id: tenant_shard_id,
|
tenant_id: Some(tenant_shard_id),
|
||||||
config,
|
config,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -416,4 +438,77 @@ impl Client {
|
|||||||
.await
|
.await
|
||||||
.map_err(Error::ReceiveBody)
|
.map_err(Error::ReceiveBody)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||||
|
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
||||||
|
self.get(uri)
|
||||||
|
.await?
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(Error::ReceiveBody)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn layer_map_info(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
) -> Result<LayerMapInfo> {
|
||||||
|
let uri = format!(
|
||||||
|
"{}/v1/tenant/{}/timeline/{}/layer",
|
||||||
|
self.mgmt_api_endpoint, tenant_shard_id, timeline_id,
|
||||||
|
);
|
||||||
|
self.get(&uri)
|
||||||
|
.await?
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(Error::ReceiveBody)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn layer_evict(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
layer_file_name: &str,
|
||||||
|
) -> Result<bool> {
|
||||||
|
let uri = format!(
|
||||||
|
"{}/v1/tenant/{}/timeline/{}/layer/{}",
|
||||||
|
self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
|
||||||
|
);
|
||||||
|
let resp = self.request_noerror(Method::DELETE, &uri, ()).await?;
|
||||||
|
match resp.status() {
|
||||||
|
StatusCode::OK => Ok(true),
|
||||||
|
StatusCode::NOT_MODIFIED => Ok(false),
|
||||||
|
// TODO: dedupe this pattern / introduce separate error variant?
|
||||||
|
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||||
|
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||||
|
Err(_) => {
|
||||||
|
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn layer_ondemand_download(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
layer_file_name: &str,
|
||||||
|
) -> Result<bool> {
|
||||||
|
let uri = format!(
|
||||||
|
"{}/v1/tenant/{}/timeline/{}/layer/{}",
|
||||||
|
self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
|
||||||
|
);
|
||||||
|
let resp = self.request_noerror(Method::GET, &uri, ()).await?;
|
||||||
|
match resp.status() {
|
||||||
|
StatusCode::OK => Ok(true),
|
||||||
|
StatusCode::NOT_MODIFIED => Ok(false),
|
||||||
|
// TODO: dedupe this pattern / introduce separate error variant?
|
||||||
|
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||||
|
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||||
|
Err(_) => {
|
||||||
|
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
272
pageserver/pagebench/src/cmd/ondemand_download_churn.rs
Normal file
272
pageserver/pagebench/src/cmd/ondemand_download_churn.rs
Normal file
@@ -0,0 +1,272 @@
|
|||||||
|
use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
|
||||||
|
|
||||||
|
use pageserver_client::mgmt_api;
|
||||||
|
use rand::seq::SliceRandom;
|
||||||
|
use tracing::{debug, info};
|
||||||
|
use utils::id::{TenantTimelineId, TimelineId};
|
||||||
|
|
||||||
|
use tokio::{
|
||||||
|
sync::{mpsc, OwnedSemaphorePermit},
|
||||||
|
task::JoinSet,
|
||||||
|
};
|
||||||
|
|
||||||
|
use std::{
|
||||||
|
num::NonZeroUsize,
|
||||||
|
sync::{
|
||||||
|
atomic::{AtomicU64, Ordering},
|
||||||
|
Arc,
|
||||||
|
},
|
||||||
|
time::{Duration, Instant},
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Evict & on-demand download random layers.
|
||||||
|
#[derive(clap::Parser)]
|
||||||
|
pub(crate) struct Args {
|
||||||
|
#[clap(long, default_value = "http://localhost:9898")]
|
||||||
|
mgmt_api_endpoint: String,
|
||||||
|
#[clap(long)]
|
||||||
|
pageserver_jwt: Option<String>,
|
||||||
|
#[clap(long)]
|
||||||
|
runtime: Option<humantime::Duration>,
|
||||||
|
#[clap(long, default_value = "1")]
|
||||||
|
tasks_per_target: NonZeroUsize,
|
||||||
|
#[clap(long, default_value = "1")]
|
||||||
|
concurrency_per_target: NonZeroUsize,
|
||||||
|
/// Limit the run to the first N discovered targets.
|
||||||
|
#[clap(long)]
|
||||||
|
limit_to_first_n_targets: Option<usize>,
|
||||||
|
/// Before starting the benchmark, live-reconfigure the pageserver to use the given
|
||||||
|
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
||||||
|
#[clap(long)]
|
||||||
|
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
||||||
|
targets: Option<Vec<TenantTimelineId>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||||
|
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||||
|
.enable_all()
|
||||||
|
.build()?;
|
||||||
|
let task = rt.spawn(main_impl(args));
|
||||||
|
rt.block_on(task).unwrap().unwrap();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
struct LiveStats {
|
||||||
|
evictions: AtomicU64,
|
||||||
|
downloads: AtomicU64,
|
||||||
|
timeline_restarts: AtomicU64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LiveStats {
|
||||||
|
fn eviction_done(&self) {
|
||||||
|
self.evictions.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
fn download_done(&self) {
|
||||||
|
self.downloads.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
fn timeline_restart_done(&self) {
|
||||||
|
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||||
|
let args: &'static Args = Box::leak(Box::new(args));
|
||||||
|
|
||||||
|
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||||
|
args.mgmt_api_endpoint.clone(),
|
||||||
|
args.pageserver_jwt.as_deref(),
|
||||||
|
));
|
||||||
|
|
||||||
|
if let Some(engine_str) = &args.set_io_engine {
|
||||||
|
mgmt_api_client.put_io_engine(engine_str).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// discover targets
|
||||||
|
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||||
|
&mgmt_api_client,
|
||||||
|
crate::util::cli::targets::Spec {
|
||||||
|
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||||
|
targets: args.targets.clone(),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let mut tasks = JoinSet::new();
|
||||||
|
|
||||||
|
let live_stats = Arc::new(LiveStats::default());
|
||||||
|
tasks.spawn({
|
||||||
|
let live_stats = Arc::clone(&live_stats);
|
||||||
|
async move {
|
||||||
|
let mut last_at = Instant::now();
|
||||||
|
loop {
|
||||||
|
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
|
||||||
|
let now = Instant::now();
|
||||||
|
let delta: Duration = now - last_at;
|
||||||
|
last_at = now;
|
||||||
|
|
||||||
|
let LiveStats {
|
||||||
|
evictions,
|
||||||
|
downloads,
|
||||||
|
timeline_restarts,
|
||||||
|
} = &*live_stats;
|
||||||
|
let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
||||||
|
let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
||||||
|
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
|
||||||
|
info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
for tl in timelines {
|
||||||
|
for _ in 0..args.tasks_per_target.get() {
|
||||||
|
tasks.spawn(timeline_actor(
|
||||||
|
args,
|
||||||
|
Arc::clone(&mgmt_api_client),
|
||||||
|
tl,
|
||||||
|
Arc::clone(&live_stats),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while let Some(res) = tasks.join_next().await {
|
||||||
|
res.unwrap();
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn timeline_actor(
|
||||||
|
args: &'static Args,
|
||||||
|
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
|
||||||
|
timeline: TenantTimelineId,
|
||||||
|
live_stats: Arc<LiveStats>,
|
||||||
|
) {
|
||||||
|
// TODO: support sharding
|
||||||
|
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||||
|
|
||||||
|
struct Timeline {
|
||||||
|
joinset: JoinSet<()>,
|
||||||
|
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
|
||||||
|
concurrency: Arc<tokio::sync::Semaphore>,
|
||||||
|
}
|
||||||
|
loop {
|
||||||
|
debug!("restarting timeline");
|
||||||
|
let layer_map_info = mgmt_api_client
|
||||||
|
.layer_map_info(tenant_shard_id, timeline.timeline_id)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let concurrency = Arc::new(tokio::sync::Semaphore::new(
|
||||||
|
args.concurrency_per_target.get(),
|
||||||
|
));
|
||||||
|
|
||||||
|
let mut joinset = JoinSet::new();
|
||||||
|
let layers = layer_map_info
|
||||||
|
.historic_layers
|
||||||
|
.into_iter()
|
||||||
|
.map(|historic_layer| {
|
||||||
|
let (tx, rx) = mpsc::channel(1);
|
||||||
|
joinset.spawn(layer_actor(
|
||||||
|
tenant_shard_id,
|
||||||
|
timeline.timeline_id,
|
||||||
|
historic_layer,
|
||||||
|
rx,
|
||||||
|
Arc::clone(&mgmt_api_client),
|
||||||
|
Arc::clone(&live_stats),
|
||||||
|
));
|
||||||
|
tx
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let mut timeline = Timeline {
|
||||||
|
joinset,
|
||||||
|
layers,
|
||||||
|
concurrency,
|
||||||
|
};
|
||||||
|
|
||||||
|
live_stats.timeline_restart_done();
|
||||||
|
|
||||||
|
loop {
|
||||||
|
assert!(!timeline.joinset.is_empty());
|
||||||
|
if let Some(res) = timeline.joinset.try_join_next() {
|
||||||
|
debug!(?res, "a layer actor exited, should not happen");
|
||||||
|
timeline.joinset.shutdown().await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut permit = Some(
|
||||||
|
Arc::clone(&timeline.concurrency)
|
||||||
|
.acquire_owned()
|
||||||
|
.await
|
||||||
|
.unwrap(),
|
||||||
|
);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let layer_tx = {
|
||||||
|
let mut rng = rand::thread_rng();
|
||||||
|
timeline.layers.choose_mut(&mut rng).expect("no layers")
|
||||||
|
};
|
||||||
|
match layer_tx.try_send(permit.take().unwrap()) {
|
||||||
|
Ok(_) => break,
|
||||||
|
Err(e) => match e {
|
||||||
|
mpsc::error::TrySendError::Full(back) => {
|
||||||
|
// TODO: retrying introduces bias away from slow downloaders
|
||||||
|
permit.replace(back);
|
||||||
|
}
|
||||||
|
mpsc::error::TrySendError::Closed(_) => panic!(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn layer_actor(
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
mut layer: HistoricLayerInfo,
|
||||||
|
mut rx: mpsc::Receiver<tokio::sync::OwnedSemaphorePermit>,
|
||||||
|
mgmt_api_client: Arc<mgmt_api::Client>,
|
||||||
|
live_stats: Arc<LiveStats>,
|
||||||
|
) {
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
enum Action {
|
||||||
|
Evict,
|
||||||
|
OnDemandDownload,
|
||||||
|
}
|
||||||
|
|
||||||
|
while let Some(_permit) = rx.recv().await {
|
||||||
|
let action = if layer.is_remote() {
|
||||||
|
Action::OnDemandDownload
|
||||||
|
} else {
|
||||||
|
Action::Evict
|
||||||
|
};
|
||||||
|
|
||||||
|
let did_it = match action {
|
||||||
|
Action::Evict => {
|
||||||
|
let did_it = mgmt_api_client
|
||||||
|
.layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name())
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
live_stats.eviction_done();
|
||||||
|
did_it
|
||||||
|
}
|
||||||
|
Action::OnDemandDownload => {
|
||||||
|
let did_it = mgmt_api_client
|
||||||
|
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
live_stats.download_done();
|
||||||
|
did_it
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if !did_it {
|
||||||
|
debug!("local copy of layer map appears out of sync, re-downloading");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
debug!("did it");
|
||||||
|
layer.set_remote(match action {
|
||||||
|
Action::Evict => true,
|
||||||
|
Action::OnDemandDownload => false,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -16,6 +16,7 @@ mod util {
|
|||||||
mod cmd {
|
mod cmd {
|
||||||
pub(super) mod basebackup;
|
pub(super) mod basebackup;
|
||||||
pub(super) mod getpage_latest_lsn;
|
pub(super) mod getpage_latest_lsn;
|
||||||
|
pub(super) mod ondemand_download_churn;
|
||||||
pub(super) mod trigger_initial_size_calculation;
|
pub(super) mod trigger_initial_size_calculation;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -25,6 +26,7 @@ enum Args {
|
|||||||
Basebackup(cmd::basebackup::Args),
|
Basebackup(cmd::basebackup::Args),
|
||||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||||
|
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
@@ -43,6 +45,7 @@ fn main() {
|
|||||||
Args::TriggerInitialSizeCalculation(args) => {
|
Args::TriggerInitialSizeCalculation(args) => {
|
||||||
cmd::trigger_initial_size_calculation::main(args)
|
cmd::trigger_initial_size_calculation::main(args)
|
||||||
}
|
}
|
||||||
|
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||||
}
|
}
|
||||||
.unwrap()
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#![recursion_limit = "300"]
|
||||||
|
|
||||||
//! Main entry point for the Page Server executable.
|
//! Main entry point for the Page Server executable.
|
||||||
|
|
||||||
use std::env::{var, VarError};
|
use std::env::{var, VarError};
|
||||||
@@ -118,6 +120,9 @@ fn main() -> anyhow::Result<()> {
|
|||||||
&[("node_id", &conf.id.to_string())],
|
&[("node_id", &conf.id.to_string())],
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// after setting up logging, log the effective IO engine choice
|
||||||
|
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||||
|
|
||||||
let tenants_path = conf.tenants_path();
|
let tenants_path = conf.tenants_path();
|
||||||
if !tenants_path.exists() {
|
if !tenants_path.exists() {
|
||||||
utils::crashsafe::create_dir_all(conf.tenants_path())
|
utils::crashsafe::create_dir_all(conf.tenants_path())
|
||||||
@@ -312,6 +317,7 @@ fn start_pageserver(
|
|||||||
let http_listener = tcp_listener::bind(http_addr)?;
|
let http_listener = tcp_listener::bind(http_addr)?;
|
||||||
|
|
||||||
let pg_addr = &conf.listen_pg_addr;
|
let pg_addr = &conf.listen_pg_addr;
|
||||||
|
|
||||||
info!("Starting pageserver pg protocol handler on {pg_addr}");
|
info!("Starting pageserver pg protocol handler on {pg_addr}");
|
||||||
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
||||||
|
|
||||||
@@ -544,7 +550,7 @@ fn start_pageserver(
|
|||||||
let router_state = Arc::new(
|
let router_state = Arc::new(
|
||||||
http::routes::State::new(
|
http::routes::State::new(
|
||||||
conf,
|
conf,
|
||||||
tenant_manager,
|
tenant_manager.clone(),
|
||||||
http_auth.clone(),
|
http_auth.clone(),
|
||||||
remote_storage.clone(),
|
remote_storage.clone(),
|
||||||
broker_client.clone(),
|
broker_client.clone(),
|
||||||
@@ -594,32 +600,37 @@ fn start_pageserver(
|
|||||||
None,
|
None,
|
||||||
"consumption metrics collection",
|
"consumption metrics collection",
|
||||||
true,
|
true,
|
||||||
async move {
|
{
|
||||||
// first wait until background jobs are cleared to launch.
|
let tenant_manager = tenant_manager.clone();
|
||||||
//
|
async move {
|
||||||
// this is because we only process active tenants and timelines, and the
|
// first wait until background jobs are cleared to launch.
|
||||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
//
|
||||||
// which will not be rate-limited.
|
// this is because we only process active tenants and timelines, and the
|
||||||
let cancel = task_mgr::shutdown_token();
|
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||||
|
// which will not be rate-limited.
|
||||||
|
let cancel = task_mgr::shutdown_token();
|
||||||
|
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = cancel.cancelled() => { return Ok(()); },
|
_ = cancel.cancelled() => { return Ok(()); },
|
||||||
_ = background_jobs_barrier.wait() => {}
|
_ = background_jobs_barrier.wait() => {}
|
||||||
};
|
};
|
||||||
|
|
||||||
pageserver::consumption_metrics::collect_metrics(
|
pageserver::consumption_metrics::collect_metrics(
|
||||||
metric_collection_endpoint,
|
tenant_manager,
|
||||||
conf.metric_collection_interval,
|
metric_collection_endpoint,
|
||||||
conf.cached_metric_collection_interval,
|
&conf.metric_collection_bucket,
|
||||||
conf.synthetic_size_calculation_interval,
|
conf.metric_collection_interval,
|
||||||
conf.id,
|
conf.cached_metric_collection_interval,
|
||||||
local_disk_storage,
|
conf.synthetic_size_calculation_interval,
|
||||||
cancel,
|
conf.id,
|
||||||
metrics_ctx,
|
local_disk_storage,
|
||||||
)
|
cancel,
|
||||||
.instrument(info_span!("metrics_collection"))
|
metrics_ctx,
|
||||||
.await?;
|
)
|
||||||
Ok(())
|
.instrument(info_span!("metrics_collection"))
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -688,6 +699,7 @@ fn start_pageserver(
|
|||||||
let bg_remote_storage = remote_storage.clone();
|
let bg_remote_storage = remote_storage.clone();
|
||||||
let bg_deletion_queue = deletion_queue.clone();
|
let bg_deletion_queue = deletion_queue.clone();
|
||||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
||||||
|
&tenant_manager,
|
||||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||||
0,
|
0,
|
||||||
));
|
));
|
||||||
|
|||||||
@@ -7,8 +7,9 @@
|
|||||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||||
|
use serde;
|
||||||
use serde::de::IntoDeserializer;
|
use serde::de::IntoDeserializer;
|
||||||
use std::env;
|
use std::{collections::HashMap, env};
|
||||||
use storage_broker::Uri;
|
use storage_broker::Uri;
|
||||||
use utils::crashsafe::path_with_suffix_extension;
|
use utils::crashsafe::path_with_suffix_extension;
|
||||||
use utils::id::ConnectionId;
|
use utils::id::ConnectionId;
|
||||||
@@ -29,18 +30,17 @@ use utils::{
|
|||||||
logging::LogFormat,
|
logging::LogFormat,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
|
||||||
use crate::tenant::config::TenantConf;
|
|
||||||
use crate::tenant::config::TenantConfOpt;
|
use crate::tenant::config::TenantConfOpt;
|
||||||
use crate::tenant::timeline::GetVectoredImpl;
|
use crate::tenant::timeline::GetVectoredImpl;
|
||||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||||
use crate::tenant::{
|
use crate::tenant::{
|
||||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||||
};
|
};
|
||||||
use crate::virtual_file;
|
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
|
||||||
|
use crate::{tenant::config::TenantConf, virtual_file};
|
||||||
use crate::{
|
use crate::{
|
||||||
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||||
};
|
};
|
||||||
|
|
||||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||||
@@ -83,6 +83,10 @@ pub mod defaults {
|
|||||||
|
|
||||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "linux"))]
|
||||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||||
|
|
||||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||||
@@ -91,6 +95,8 @@ pub mod defaults {
|
|||||||
|
|
||||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||||
|
|
||||||
|
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Default built-in configuration file.
|
/// Default built-in configuration file.
|
||||||
///
|
///
|
||||||
@@ -152,6 +158,8 @@ pub mod defaults {
|
|||||||
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
|
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
|
||||||
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
|
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
|
||||||
|
|
||||||
|
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
|
||||||
|
|
||||||
[remote_storage]
|
[remote_storage]
|
||||||
|
|
||||||
"#
|
"#
|
||||||
@@ -230,6 +238,7 @@ pub struct PageServerConf {
|
|||||||
// How often to send unchanged cached metrics to the metrics endpoint.
|
// How often to send unchanged cached metrics to the metrics endpoint.
|
||||||
pub cached_metric_collection_interval: Duration,
|
pub cached_metric_collection_interval: Duration,
|
||||||
pub metric_collection_endpoint: Option<Url>,
|
pub metric_collection_endpoint: Option<Url>,
|
||||||
|
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||||
pub synthetic_size_calculation_interval: Duration,
|
pub synthetic_size_calculation_interval: Duration,
|
||||||
|
|
||||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||||
@@ -274,6 +283,13 @@ pub struct PageServerConf {
|
|||||||
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||||
|
|
||||||
pub validate_vectored_get: bool,
|
pub validate_vectored_get: bool,
|
||||||
|
|
||||||
|
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
|
||||||
|
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
|
||||||
|
/// of ephemeral data.
|
||||||
|
///
|
||||||
|
/// Setting this to zero disables limits on total ephemeral layer size.
|
||||||
|
pub ephemeral_bytes_per_memory_kb: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||||
@@ -286,21 +302,49 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
|
|||||||
|
|
||||||
// use dedicated enum for builder to better indicate the intention
|
// use dedicated enum for builder to better indicate the intention
|
||||||
// and avoid possible confusion with nested options
|
// and avoid possible confusion with nested options
|
||||||
|
#[derive(Clone, Default)]
|
||||||
pub enum BuilderValue<T> {
|
pub enum BuilderValue<T> {
|
||||||
Set(T),
|
Set(T),
|
||||||
|
#[default]
|
||||||
NotSet,
|
NotSet,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T> BuilderValue<T> {
|
impl<T: Clone> BuilderValue<T> {
|
||||||
pub fn ok_or<E>(self, err: E) -> Result<T, E> {
|
pub fn ok_or(&self, field_name: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
|
||||||
match self {
|
match self {
|
||||||
Self::Set(v) => Ok(v),
|
Self::Set(v) => Ok(v.clone()),
|
||||||
Self::NotSet => Err(err),
|
Self::NotSet => match default {
|
||||||
|
BuilderValue::Set(v) => Ok(v.clone()),
|
||||||
|
BuilderValue::NotSet => {
|
||||||
|
anyhow::bail!("missing config value {field_name:?}")
|
||||||
|
}
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||||
|
// as a separate structure. This information is not needed by the pageserver
|
||||||
|
// itself, it is only used for registering the pageserver with the control
|
||||||
|
// plane and/or storage controller.
|
||||||
|
//
|
||||||
|
#[derive(serde::Deserialize)]
|
||||||
|
pub(crate) struct NodeMetadata {
|
||||||
|
#[serde(rename = "host")]
|
||||||
|
pub(crate) postgres_host: String,
|
||||||
|
#[serde(rename = "port")]
|
||||||
|
pub(crate) postgres_port: u16,
|
||||||
|
pub(crate) http_host: String,
|
||||||
|
pub(crate) http_port: u16,
|
||||||
|
|
||||||
|
// Deployment tools may write fields to the metadata file beyond what we
|
||||||
|
// use in this type: this type intentionally only names fields that we require.
|
||||||
|
#[serde(flatten)]
|
||||||
|
pub(crate) other: HashMap<String, serde_json::Value>,
|
||||||
|
}
|
||||||
|
|
||||||
// needed to simplify config construction
|
// needed to simplify config construction
|
||||||
|
#[derive(Default)]
|
||||||
struct PageServerConfigBuilder {
|
struct PageServerConfigBuilder {
|
||||||
listen_pg_addr: BuilderValue<String>,
|
listen_pg_addr: BuilderValue<String>,
|
||||||
|
|
||||||
@@ -341,6 +385,7 @@ struct PageServerConfigBuilder {
|
|||||||
cached_metric_collection_interval: BuilderValue<Duration>,
|
cached_metric_collection_interval: BuilderValue<Duration>,
|
||||||
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
||||||
synthetic_size_calculation_interval: BuilderValue<Duration>,
|
synthetic_size_calculation_interval: BuilderValue<Duration>,
|
||||||
|
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
|
||||||
|
|
||||||
disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
|
disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
|
||||||
|
|
||||||
@@ -366,10 +411,13 @@ struct PageServerConfigBuilder {
|
|||||||
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
|
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
|
||||||
|
|
||||||
validate_vectored_get: BuilderValue<bool>,
|
validate_vectored_get: BuilderValue<bool>,
|
||||||
|
|
||||||
|
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for PageServerConfigBuilder {
|
impl PageServerConfigBuilder {
|
||||||
fn default() -> Self {
|
#[inline(always)]
|
||||||
|
fn default_values() -> Self {
|
||||||
use self::BuilderValue::*;
|
use self::BuilderValue::*;
|
||||||
use defaults::*;
|
use defaults::*;
|
||||||
Self {
|
Self {
|
||||||
@@ -422,6 +470,8 @@ impl Default for PageServerConfigBuilder {
|
|||||||
.expect("cannot parse default synthetic size calculation interval")),
|
.expect("cannot parse default synthetic size calculation interval")),
|
||||||
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
||||||
|
|
||||||
|
metric_collection_bucket: Set(None),
|
||||||
|
|
||||||
disk_usage_based_eviction: Set(None),
|
disk_usage_based_eviction: Set(None),
|
||||||
|
|
||||||
test_remote_failures: Set(0),
|
test_remote_failures: Set(0),
|
||||||
@@ -449,6 +499,7 @@ impl Default for PageServerConfigBuilder {
|
|||||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||||
)),
|
)),
|
||||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||||
|
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -553,6 +604,13 @@ impl PageServerConfigBuilder {
|
|||||||
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
|
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn metric_collection_bucket(
|
||||||
|
&mut self,
|
||||||
|
metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||||
|
) {
|
||||||
|
self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn synthetic_size_calculation_interval(
|
pub fn synthetic_size_calculation_interval(
|
||||||
&mut self,
|
&mut self,
|
||||||
synthetic_size_calculation_interval: Duration,
|
synthetic_size_calculation_interval: Duration,
|
||||||
@@ -621,126 +679,103 @@ impl PageServerConfigBuilder {
|
|||||||
self.validate_vectored_get = BuilderValue::Set(value);
|
self.validate_vectored_get = BuilderValue::Set(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
|
||||||
|
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||||
let concurrent_tenant_warmup = self
|
let default = Self::default_values();
|
||||||
.concurrent_tenant_warmup
|
|
||||||
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
|
macro_rules! conf {
|
||||||
let concurrent_tenant_size_logical_size_queries = self
|
(USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => {
|
||||||
.concurrent_tenant_size_logical_size_queries
|
PageServerConf {
|
||||||
.ok_or(anyhow!(
|
$(
|
||||||
"missing concurrent_tenant_size_logical_size_queries"
|
$field: self.$field.ok_or(stringify!($field), default.$field)?,
|
||||||
))?;
|
)*
|
||||||
Ok(PageServerConf {
|
$(
|
||||||
listen_pg_addr: self
|
$custom_field: $custom_value,
|
||||||
.listen_pg_addr
|
)*
|
||||||
.ok_or(anyhow!("missing listen_pg_addr"))?,
|
}
|
||||||
listen_http_addr: self
|
};
|
||||||
.listen_http_addr
|
}
|
||||||
.ok_or(anyhow!("missing listen_http_addr"))?,
|
|
||||||
availability_zone: self
|
Ok(conf!(
|
||||||
.availability_zone
|
USING DEFAULT
|
||||||
.ok_or(anyhow!("missing availability_zone"))?,
|
{
|
||||||
wait_lsn_timeout: self
|
listen_pg_addr,
|
||||||
.wait_lsn_timeout
|
listen_http_addr,
|
||||||
.ok_or(anyhow!("missing wait_lsn_timeout"))?,
|
availability_zone,
|
||||||
wal_redo_timeout: self
|
wait_lsn_timeout,
|
||||||
.wal_redo_timeout
|
wal_redo_timeout,
|
||||||
.ok_or(anyhow!("missing wal_redo_timeout"))?,
|
superuser,
|
||||||
superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
|
page_cache_size,
|
||||||
page_cache_size: self
|
max_file_descriptors,
|
||||||
.page_cache_size
|
workdir,
|
||||||
.ok_or(anyhow!("missing page_cache_size"))?,
|
pg_distrib_dir,
|
||||||
max_file_descriptors: self
|
http_auth_type,
|
||||||
.max_file_descriptors
|
pg_auth_type,
|
||||||
.ok_or(anyhow!("missing max_file_descriptors"))?,
|
auth_validation_public_key_path,
|
||||||
workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
|
remote_storage_config,
|
||||||
pg_distrib_dir: self
|
id,
|
||||||
.pg_distrib_dir
|
broker_endpoint,
|
||||||
.ok_or(anyhow!("missing pg_distrib_dir"))?,
|
broker_keepalive_interval,
|
||||||
http_auth_type: self
|
log_format,
|
||||||
.http_auth_type
|
metric_collection_interval,
|
||||||
.ok_or(anyhow!("missing http_auth_type"))?,
|
cached_metric_collection_interval,
|
||||||
pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
|
metric_collection_endpoint,
|
||||||
auth_validation_public_key_path: self
|
metric_collection_bucket,
|
||||||
.auth_validation_public_key_path
|
synthetic_size_calculation_interval,
|
||||||
.ok_or(anyhow!("missing auth_validation_public_key_path"))?,
|
disk_usage_based_eviction,
|
||||||
remote_storage_config: self
|
test_remote_failures,
|
||||||
.remote_storage_config
|
ondemand_download_behavior_treat_error_as_warn,
|
||||||
.ok_or(anyhow!("missing remote_storage_config"))?,
|
background_task_maximum_delay,
|
||||||
id: self.id.ok_or(anyhow!("missing id"))?,
|
control_plane_api,
|
||||||
// TenantConf is handled separately
|
control_plane_api_token,
|
||||||
default_tenant_conf: TenantConf::default(),
|
control_plane_emergency_mode,
|
||||||
broker_endpoint: self
|
heatmap_upload_concurrency,
|
||||||
.broker_endpoint
|
secondary_download_concurrency,
|
||||||
.ok_or(anyhow!("No broker endpoints provided"))?,
|
ingest_batch_size,
|
||||||
broker_keepalive_interval: self
|
get_vectored_impl,
|
||||||
.broker_keepalive_interval
|
max_vectored_read_bytes,
|
||||||
.ok_or(anyhow!("No broker keepalive interval provided"))?,
|
validate_vectored_get,
|
||||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
ephemeral_bytes_per_memory_kb,
|
||||||
concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
|
}
|
||||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
CUSTOM LOGIC
|
||||||
concurrent_tenant_size_logical_size_queries,
|
{
|
||||||
),
|
// TenantConf is handled separately
|
||||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
|
default_tenant_conf: TenantConf::default(),
|
||||||
concurrent_tenant_size_logical_size_queries,
|
concurrent_tenant_warmup: ConfigurableSemaphore::new({
|
||||||
),
|
self
|
||||||
metric_collection_interval: self
|
.concurrent_tenant_warmup
|
||||||
.metric_collection_interval
|
.ok_or("concurrent_tenant_warmpup",
|
||||||
.ok_or(anyhow!("missing metric_collection_interval"))?,
|
default.concurrent_tenant_warmup)?
|
||||||
cached_metric_collection_interval: self
|
}),
|
||||||
.cached_metric_collection_interval
|
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
||||||
.ok_or(anyhow!("missing cached_metric_collection_interval"))?,
|
self
|
||||||
metric_collection_endpoint: self
|
.concurrent_tenant_size_logical_size_queries
|
||||||
.metric_collection_endpoint
|
.ok_or("concurrent_tenant_size_logical_size_queries",
|
||||||
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
|
default.concurrent_tenant_size_logical_size_queries.clone())?
|
||||||
synthetic_size_calculation_interval: self
|
),
|
||||||
.synthetic_size_calculation_interval
|
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
|
||||||
.ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
|
// re-use `concurrent_tenant_size_logical_size_queries`
|
||||||
disk_usage_based_eviction: self
|
self
|
||||||
.disk_usage_based_eviction
|
.concurrent_tenant_size_logical_size_queries
|
||||||
.ok_or(anyhow!("missing disk_usage_based_eviction"))?,
|
.ok_or("eviction_task_immitated_concurrent_logical_size_queries",
|
||||||
test_remote_failures: self
|
default.concurrent_tenant_size_logical_size_queries.clone())?,
|
||||||
.test_remote_failures
|
),
|
||||||
.ok_or(anyhow!("missing test_remote_failuers"))?,
|
virtual_file_io_engine: match self.virtual_file_io_engine {
|
||||||
ondemand_download_behavior_treat_error_as_warn: self
|
BuilderValue::Set(v) => v,
|
||||||
.ondemand_download_behavior_treat_error_as_warn
|
BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? {
|
||||||
.ok_or(anyhow!(
|
io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
|
||||||
"missing ondemand_download_behavior_treat_error_as_warn"
|
io_engine::FeatureTestResult::Worse { engine, remark } => {
|
||||||
))?,
|
// TODO: bubble this up to the caller so we can tracing::warn! it.
|
||||||
background_task_maximum_delay: self
|
eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
|
||||||
.background_task_maximum_delay
|
engine
|
||||||
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
|
}
|
||||||
control_plane_api: self
|
},
|
||||||
.control_plane_api
|
},
|
||||||
.ok_or(anyhow!("missing control_plane_api"))?,
|
}
|
||||||
control_plane_api_token: self
|
))
|
||||||
.control_plane_api_token
|
|
||||||
.ok_or(anyhow!("missing control_plane_api_token"))?,
|
|
||||||
control_plane_emergency_mode: self
|
|
||||||
.control_plane_emergency_mode
|
|
||||||
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
|
|
||||||
heatmap_upload_concurrency: self
|
|
||||||
.heatmap_upload_concurrency
|
|
||||||
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
|
|
||||||
secondary_download_concurrency: self
|
|
||||||
.secondary_download_concurrency
|
|
||||||
.ok_or(anyhow!("missing secondary_download_concurrency"))?,
|
|
||||||
ingest_batch_size: self
|
|
||||||
.ingest_batch_size
|
|
||||||
.ok_or(anyhow!("missing ingest_batch_size"))?,
|
|
||||||
virtual_file_io_engine: self
|
|
||||||
.virtual_file_io_engine
|
|
||||||
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
|
|
||||||
get_vectored_impl: self
|
|
||||||
.get_vectored_impl
|
|
||||||
.ok_or(anyhow!("missing get_vectored_impl"))?,
|
|
||||||
max_vectored_read_bytes: self
|
|
||||||
.max_vectored_read_bytes
|
|
||||||
.ok_or(anyhow!("missing max_vectored_read_bytes"))?,
|
|
||||||
validate_vectored_get: self
|
|
||||||
.validate_vectored_get
|
|
||||||
.ok_or(anyhow!("missing validate_vectored_get"))?,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -757,6 +792,10 @@ impl PageServerConf {
|
|||||||
self.workdir.join("deletion")
|
self.workdir.join("deletion")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn metadata_path(&self) -> Utf8PathBuf {
|
||||||
|
self.workdir.join("metadata.json")
|
||||||
|
}
|
||||||
|
|
||||||
pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
|
pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
|
||||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||||
// increment this.
|
// increment this.
|
||||||
@@ -816,18 +855,7 @@ impl PageServerConf {
|
|||||||
.join(timeline_id.to_string())
|
.join(timeline_id.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn timeline_uninit_mark_file_path(
|
pub(crate) fn timeline_delete_mark_file_path(
|
||||||
&self,
|
|
||||||
tenant_shard_id: TenantShardId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
) -> Utf8PathBuf {
|
|
||||||
path_with_suffix_extension(
|
|
||||||
self.timeline_path(&tenant_shard_id, &timeline_id),
|
|
||||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn timeline_delete_mark_file_path(
|
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
@@ -838,7 +866,10 @@ impl PageServerConf {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
pub(crate) fn tenant_deleted_mark_file_path(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: &TenantShardId,
|
||||||
|
) -> Utf8PathBuf {
|
||||||
self.tenant_path(tenant_shard_id)
|
self.tenant_path(tenant_shard_id)
|
||||||
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
||||||
}
|
}
|
||||||
@@ -942,6 +973,9 @@ impl PageServerConf {
|
|||||||
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
|
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
|
||||||
builder.metric_collection_endpoint(Some(endpoint));
|
builder.metric_collection_endpoint(Some(endpoint));
|
||||||
},
|
},
|
||||||
|
"metric_collection_bucket" => {
|
||||||
|
builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
|
||||||
|
}
|
||||||
"synthetic_size_calculation_interval" =>
|
"synthetic_size_calculation_interval" =>
|
||||||
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
|
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
|
||||||
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
|
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
|
||||||
@@ -995,6 +1029,9 @@ impl PageServerConf {
|
|||||||
"validate_vectored_get" => {
|
"validate_vectored_get" => {
|
||||||
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
|
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
|
||||||
}
|
}
|
||||||
|
"ephemeral_bytes_per_memory_kb" => {
|
||||||
|
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
|
||||||
|
}
|
||||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1057,6 +1094,7 @@ impl PageServerConf {
|
|||||||
metric_collection_interval: Duration::from_secs(60),
|
metric_collection_interval: Duration::from_secs(60),
|
||||||
cached_metric_collection_interval: Duration::from_secs(60 * 60),
|
cached_metric_collection_interval: Duration::from_secs(60 * 60),
|
||||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||||
|
metric_collection_bucket: None,
|
||||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
synthetic_size_calculation_interval: Duration::from_secs(60),
|
||||||
disk_usage_based_eviction: None,
|
disk_usage_based_eviction: None,
|
||||||
test_remote_failures: 0,
|
test_remote_failures: 0,
|
||||||
@@ -1075,6 +1113,7 @@ impl PageServerConf {
|
|||||||
.expect("Invalid default constant"),
|
.expect("Invalid default constant"),
|
||||||
),
|
),
|
||||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||||
|
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1289,6 +1328,7 @@ background_task_maximum_delay = '334 s'
|
|||||||
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
|
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
|
||||||
)?,
|
)?,
|
||||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||||
|
metric_collection_bucket: None,
|
||||||
synthetic_size_calculation_interval: humantime::parse_duration(
|
synthetic_size_calculation_interval: humantime::parse_duration(
|
||||||
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
|
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
|
||||||
)?,
|
)?,
|
||||||
@@ -1311,6 +1351,7 @@ background_task_maximum_delay = '334 s'
|
|||||||
.expect("Invalid default constant")
|
.expect("Invalid default constant")
|
||||||
),
|
),
|
||||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||||
|
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||||
},
|
},
|
||||||
"Correct defaults should be used when no config values are provided"
|
"Correct defaults should be used when no config values are provided"
|
||||||
);
|
);
|
||||||
@@ -1363,6 +1404,7 @@ background_task_maximum_delay = '334 s'
|
|||||||
metric_collection_interval: Duration::from_secs(222),
|
metric_collection_interval: Duration::from_secs(222),
|
||||||
cached_metric_collection_interval: Duration::from_secs(22200),
|
cached_metric_collection_interval: Duration::from_secs(22200),
|
||||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||||
|
metric_collection_bucket: None,
|
||||||
synthetic_size_calculation_interval: Duration::from_secs(333),
|
synthetic_size_calculation_interval: Duration::from_secs(333),
|
||||||
disk_usage_based_eviction: None,
|
disk_usage_based_eviction: None,
|
||||||
test_remote_failures: 0,
|
test_remote_failures: 0,
|
||||||
@@ -1381,6 +1423,7 @@ background_task_maximum_delay = '334 s'
|
|||||||
.expect("Invalid default constant")
|
.expect("Invalid default constant")
|
||||||
),
|
),
|
||||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||||
|
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||||
},
|
},
|
||||||
"Should be able to parse all basic config values correctly"
|
"Should be able to parse all basic config values correctly"
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -3,10 +3,13 @@
|
|||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||||
use crate::tenant::tasks::BackgroundLoopKind;
|
use crate::tenant::tasks::BackgroundLoopKind;
|
||||||
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
|
use crate::tenant::{
|
||||||
|
mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
|
||||||
|
};
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use consumption_metrics::EventType;
|
use consumption_metrics::EventType;
|
||||||
use pageserver_api::models::TenantState;
|
use pageserver_api::models::TenantState;
|
||||||
|
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
@@ -40,7 +43,9 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
|
|||||||
/// Main thread that serves metrics collection
|
/// Main thread that serves metrics collection
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub async fn collect_metrics(
|
pub async fn collect_metrics(
|
||||||
|
tenant_manager: Arc<TenantManager>,
|
||||||
metric_collection_endpoint: &Url,
|
metric_collection_endpoint: &Url,
|
||||||
|
metric_collection_bucket: &Option<RemoteStorageConfig>,
|
||||||
metric_collection_interval: Duration,
|
metric_collection_interval: Duration,
|
||||||
_cached_metric_collection_interval: Duration,
|
_cached_metric_collection_interval: Duration,
|
||||||
synthetic_size_calculation_interval: Duration,
|
synthetic_size_calculation_interval: Duration,
|
||||||
@@ -65,15 +70,19 @@ pub async fn collect_metrics(
|
|||||||
None,
|
None,
|
||||||
"synthetic size calculation",
|
"synthetic size calculation",
|
||||||
false,
|
false,
|
||||||
async move {
|
{
|
||||||
calculate_synthetic_size_worker(
|
let tenant_manager = tenant_manager.clone();
|
||||||
synthetic_size_calculation_interval,
|
async move {
|
||||||
&cancel,
|
calculate_synthetic_size_worker(
|
||||||
&worker_ctx,
|
tenant_manager,
|
||||||
)
|
synthetic_size_calculation_interval,
|
||||||
.instrument(info_span!("synthetic_size_worker"))
|
&cancel,
|
||||||
.await?;
|
&worker_ctx,
|
||||||
Ok(())
|
)
|
||||||
|
.instrument(info_span!("synthetic_size_worker"))
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -94,13 +103,27 @@ pub async fn collect_metrics(
|
|||||||
.build()
|
.build()
|
||||||
.expect("Failed to create http client with timeout");
|
.expect("Failed to create http client with timeout");
|
||||||
|
|
||||||
|
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
|
||||||
|
match GenericRemoteStorage::from_config(bucket_config) {
|
||||||
|
Ok(client) => Some(client),
|
||||||
|
Err(e) => {
|
||||||
|
// Non-fatal error: if we were given an invalid config, we will proceed
|
||||||
|
// with sending metrics over the network, but not to S3.
|
||||||
|
tracing::warn!("Invalid configuration for metric_collection_bucket: {e}");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
let node_id = node_id.to_string();
|
let node_id = node_id.to_string();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let started_at = Instant::now();
|
let started_at = Instant::now();
|
||||||
|
|
||||||
// these are point in time, with variable "now"
|
// these are point in time, with variable "now"
|
||||||
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
|
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
||||||
|
|
||||||
let metrics = Arc::new(metrics);
|
let metrics = Arc::new(metrics);
|
||||||
|
|
||||||
@@ -118,10 +141,18 @@ pub async fn collect_metrics(
|
|||||||
tracing::error!("failed to persist metrics to {path:?}: {e:#}");
|
tracing::error!("failed to persist metrics to {path:?}: {e:#}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(bucket_client) = &bucket_client {
|
||||||
|
let res =
|
||||||
|
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
|
||||||
|
if let Err(e) = res {
|
||||||
|
tracing::error!("failed to upload to S3: {e:#}");
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let upload = async {
|
let upload = async {
|
||||||
let res = upload::upload_metrics(
|
let res = upload::upload_metrics_http(
|
||||||
&client,
|
&client,
|
||||||
metric_collection_endpoint,
|
metric_collection_endpoint,
|
||||||
&cancel,
|
&cancel,
|
||||||
@@ -132,7 +163,7 @@ pub async fn collect_metrics(
|
|||||||
.await;
|
.await;
|
||||||
if let Err(e) = res {
|
if let Err(e) = res {
|
||||||
// serialization error which should never happen
|
// serialization error which should never happen
|
||||||
tracing::error!("failed to upload due to {e:#}");
|
tracing::error!("failed to upload via HTTP due to {e:#}");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -247,6 +278,7 @@ async fn reschedule(
|
|||||||
|
|
||||||
/// Calculate synthetic size for each active tenant
|
/// Calculate synthetic size for each active tenant
|
||||||
async fn calculate_synthetic_size_worker(
|
async fn calculate_synthetic_size_worker(
|
||||||
|
tenant_manager: Arc<TenantManager>,
|
||||||
synthetic_size_calculation_interval: Duration,
|
synthetic_size_calculation_interval: Duration,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
@@ -259,7 +291,7 @@ async fn calculate_synthetic_size_worker(
|
|||||||
loop {
|
loop {
|
||||||
let started_at = Instant::now();
|
let started_at = Instant::now();
|
||||||
|
|
||||||
let tenants = match mgr::list_tenants().await {
|
let tenants = match tenant_manager.list_tenants() {
|
||||||
Ok(tenants) => tenants,
|
Ok(tenants) => tenants,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
warn!("cannot get tenant list: {e:#}");
|
warn!("cannot get tenant list: {e:#}");
|
||||||
@@ -278,10 +310,14 @@ async fn calculate_synthetic_size_worker(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
|
let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if !tenant.is_active() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// there is never any reason to exit calculate_synthetic_size_worker following any
|
// there is never any reason to exit calculate_synthetic_size_worker following any
|
||||||
// return value -- we don't need to care about shutdown because no tenant is found when
|
// return value -- we don't need to care about shutdown because no tenant is found when
|
||||||
// pageserver is shut down.
|
// pageserver is shut down.
|
||||||
@@ -319,9 +355,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
|
|||||||
};
|
};
|
||||||
|
|
||||||
// this error can be returned if timeline is shutting down, but it does not
|
// this error can be returned if timeline is shutting down, but it does not
|
||||||
// mean the synthetic size worker should terminate. we do not need any checks
|
// mean the synthetic size worker should terminate.
|
||||||
// in this function because `mgr::get_tenant` will error out after shutdown has
|
|
||||||
// progressed to shutting down tenants.
|
|
||||||
let shutting_down = matches!(
|
let shutting_down = matches!(
|
||||||
e.downcast_ref::<PageReconstructError>(),
|
e.downcast_ref::<PageReconstructError>(),
|
||||||
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
|
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
use crate::tenant::mgr::TenantManager;
|
||||||
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
|
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use consumption_metrics::EventType;
|
use consumption_metrics::EventType;
|
||||||
@@ -181,6 +182,7 @@ impl MetricsKey {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(super) async fn collect_all_metrics(
|
pub(super) async fn collect_all_metrics(
|
||||||
|
tenant_manager: &Arc<TenantManager>,
|
||||||
cached_metrics: &Cache,
|
cached_metrics: &Cache,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Vec<RawMetric> {
|
) -> Vec<RawMetric> {
|
||||||
@@ -188,7 +190,7 @@ pub(super) async fn collect_all_metrics(
|
|||||||
|
|
||||||
let started_at = std::time::Instant::now();
|
let started_at = std::time::Instant::now();
|
||||||
|
|
||||||
let tenants = match crate::tenant::mgr::list_tenants().await {
|
let tenants = match tenant_manager.list_tenants() {
|
||||||
Ok(tenants) => tenants,
|
Ok(tenants) => tenants,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
tracing::error!("failed to list tenants: {:?}", err);
|
tracing::error!("failed to list tenants: {:?}", err);
|
||||||
@@ -200,7 +202,8 @@ pub(super) async fn collect_all_metrics(
|
|||||||
if state != TenantState::Active || !id.is_zero() {
|
if state != TenantState::Active || !id.is_zero() {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
crate::tenant::mgr::get_tenant(id, true)
|
tenant_manager
|
||||||
|
.get_attached_tenant_shard(id)
|
||||||
.ok()
|
.ok()
|
||||||
.map(|tenant| (id.tenant_id, tenant))
|
.map(|tenant| (id.tenant_id, tenant))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,9 @@
|
|||||||
|
use std::time::SystemTime;
|
||||||
|
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
|
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
|
||||||
|
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::Instrument;
|
use tracing::Instrument;
|
||||||
|
|
||||||
@@ -13,8 +18,9 @@ struct Ids {
|
|||||||
pub(super) timeline_id: Option<TimelineId>,
|
pub(super) timeline_id: Option<TimelineId>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Serialize and write metrics to an HTTP endpoint
|
||||||
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
|
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
|
||||||
pub(super) async fn upload_metrics(
|
pub(super) async fn upload_metrics_http(
|
||||||
client: &reqwest::Client,
|
client: &reqwest::Client,
|
||||||
metric_collection_endpoint: &reqwest::Url,
|
metric_collection_endpoint: &reqwest::Url,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
@@ -74,6 +80,60 @@ pub(super) async fn upload_metrics(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Serialize and write metrics to a remote storage object
|
||||||
|
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
|
||||||
|
pub(super) async fn upload_metrics_bucket(
|
||||||
|
client: &GenericRemoteStorage,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
node_id: &str,
|
||||||
|
metrics: &[RawMetric],
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
if metrics.is_empty() {
|
||||||
|
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
|
||||||
|
// of an empty object.
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compose object path
|
||||||
|
let datetime: DateTime<Utc> = SystemTime::now().into();
|
||||||
|
let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
|
||||||
|
let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;
|
||||||
|
|
||||||
|
// Set up a gzip writer into a buffer
|
||||||
|
let mut compressed_bytes: Vec<u8> = Vec::new();
|
||||||
|
let compressed_writer = std::io::Cursor::new(&mut compressed_bytes);
|
||||||
|
let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer);
|
||||||
|
|
||||||
|
// Serialize and write into compressed buffer
|
||||||
|
let started_at = std::time::Instant::now();
|
||||||
|
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
|
||||||
|
let (_chunk, body) = res?;
|
||||||
|
gzip_writer.write_all(&body).await?;
|
||||||
|
}
|
||||||
|
gzip_writer.flush().await?;
|
||||||
|
gzip_writer.shutdown().await?;
|
||||||
|
let compressed_length = compressed_bytes.len();
|
||||||
|
|
||||||
|
// Write to remote storage
|
||||||
|
client
|
||||||
|
.upload_storage_object(
|
||||||
|
futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))),
|
||||||
|
compressed_length,
|
||||||
|
&path,
|
||||||
|
cancel,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let elapsed = started_at.elapsed();
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
compressed_length,
|
||||||
|
elapsed_ms = elapsed.as_millis(),
|
||||||
|
"write metrics bucket at {path}",
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
// The return type is quite ugly, but we gain testability in isolation
|
// The return type is quite ugly, but we gain testability in isolation
|
||||||
fn serialize_in_chunks<'a, F>(
|
fn serialize_in_chunks<'a, F>(
|
||||||
chunk_size: usize,
|
chunk_size: usize,
|
||||||
|
|||||||
@@ -2,9 +2,11 @@ use std::collections::HashMap;
|
|||||||
|
|
||||||
use futures::Future;
|
use futures::Future;
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
|
controller_api::NodeRegisterRequest,
|
||||||
shard::TenantShardId,
|
shard::TenantShardId,
|
||||||
upcall_api::{
|
upcall_api::{
|
||||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
|
||||||
|
ValidateRequestTenant, ValidateResponse,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
use serde::{de::DeserializeOwned, Serialize};
|
use serde::{de::DeserializeOwned, Serialize};
|
||||||
@@ -12,7 +14,10 @@ use tokio_util::sync::CancellationToken;
|
|||||||
use url::Url;
|
use url::Url;
|
||||||
use utils::{backoff, generation::Generation, id::NodeId};
|
use utils::{backoff, generation::Generation, id::NodeId};
|
||||||
|
|
||||||
use crate::config::PageServerConf;
|
use crate::{
|
||||||
|
config::{NodeMetadata, PageServerConf},
|
||||||
|
virtual_file::on_fatal_io_error,
|
||||||
|
};
|
||||||
|
|
||||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||||
@@ -32,7 +37,10 @@ pub enum RetryForeverError {
|
|||||||
pub trait ControlPlaneGenerationsApi {
|
pub trait ControlPlaneGenerationsApi {
|
||||||
fn re_attach(
|
fn re_attach(
|
||||||
&self,
|
&self,
|
||||||
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
|
conf: &PageServerConf,
|
||||||
|
) -> impl Future<
|
||||||
|
Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
|
||||||
|
> + Send;
|
||||||
fn validate(
|
fn validate(
|
||||||
&self,
|
&self,
|
||||||
tenants: Vec<(TenantShardId, Generation)>,
|
tenants: Vec<(TenantShardId, Generation)>,
|
||||||
@@ -110,13 +118,59 @@ impl ControlPlaneClient {
|
|||||||
|
|
||||||
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||||
/// Block until we get a successful response, or error out if we are shut down
|
/// Block until we get a successful response, or error out if we are shut down
|
||||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
async fn re_attach(
|
||||||
|
&self,
|
||||||
|
conf: &PageServerConf,
|
||||||
|
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
|
||||||
let re_attach_path = self
|
let re_attach_path = self
|
||||||
.base_url
|
.base_url
|
||||||
.join("re-attach")
|
.join("re-attach")
|
||||||
.expect("Failed to build re-attach path");
|
.expect("Failed to build re-attach path");
|
||||||
|
|
||||||
|
// Include registration content in the re-attach request if a metadata file is readable
|
||||||
|
let metadata_path = conf.metadata_path();
|
||||||
|
let register = match tokio::fs::read_to_string(&metadata_path).await {
|
||||||
|
Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
|
||||||
|
Ok(m) => {
|
||||||
|
// Since we run one time at startup, be generous in our logging and
|
||||||
|
// dump all metadata.
|
||||||
|
tracing::info!(
|
||||||
|
"Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
|
||||||
|
m.postgres_host,
|
||||||
|
m.postgres_port,
|
||||||
|
m.http_host,
|
||||||
|
m.http_port,
|
||||||
|
m.other
|
||||||
|
);
|
||||||
|
|
||||||
|
Some(NodeRegisterRequest {
|
||||||
|
node_id: conf.id,
|
||||||
|
listen_pg_addr: m.postgres_host,
|
||||||
|
listen_pg_port: m.postgres_port,
|
||||||
|
listen_http_addr: m.http_host,
|
||||||
|
listen_http_port: m.http_port,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!("Unreadable metadata in {metadata_path}: {e}");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(e) => {
|
||||||
|
if e.kind() == std::io::ErrorKind::NotFound {
|
||||||
|
// This is legal: we may have been deployed with some external script
|
||||||
|
// doing registration for us.
|
||||||
|
tracing::info!("Metadata file not found at {metadata_path}");
|
||||||
|
} else {
|
||||||
|
on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}"))
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let request = ReAttachRequest {
|
let request = ReAttachRequest {
|
||||||
node_id: self.node_id,
|
node_id: self.node_id,
|
||||||
|
register,
|
||||||
};
|
};
|
||||||
|
|
||||||
fail::fail_point!("control-plane-client-re-attach");
|
fail::fail_point!("control-plane-client-re-attach");
|
||||||
@@ -130,7 +184,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
|||||||
Ok(response
|
Ok(response
|
||||||
.tenants
|
.tenants
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|t| (t.id, Generation::new(t.gen)))
|
.map(|rart| (rart.id, rart))
|
||||||
.collect::<HashMap<_, _>>())
|
.collect::<HashMap<_, _>>())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -724,8 +724,8 @@ impl DeletionQueue {
|
|||||||
mod test {
|
mod test {
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use hex_literal::hex;
|
use hex_literal::hex;
|
||||||
use pageserver_api::shard::ShardIndex;
|
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
|
||||||
use std::io::ErrorKind;
|
use std::{io::ErrorKind, time::Duration};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||||
@@ -831,9 +831,13 @@ mod test {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl ControlPlaneGenerationsApi for MockControlPlane {
|
impl ControlPlaneGenerationsApi for MockControlPlane {
|
||||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
async fn re_attach(
|
||||||
|
&self,
|
||||||
|
_conf: &PageServerConf,
|
||||||
|
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
|
||||||
unimplemented!()
|
unimplemented!()
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn validate(
|
async fn validate(
|
||||||
&self,
|
&self,
|
||||||
tenants: Vec<(TenantShardId, Generation)>,
|
tenants: Vec<(TenantShardId, Generation)>,
|
||||||
|
|||||||
@@ -61,7 +61,6 @@ use crate::{
|
|||||||
metrics::disk_usage_based_eviction::METRICS,
|
metrics::disk_usage_based_eviction::METRICS,
|
||||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||||
tenant::{
|
tenant::{
|
||||||
self,
|
|
||||||
mgr::TenantManager,
|
mgr::TenantManager,
|
||||||
remote_timeline_client::LayerFileMetadata,
|
remote_timeline_client::LayerFileMetadata,
|
||||||
secondary::SecondaryTenant,
|
secondary::SecondaryTenant,
|
||||||
@@ -814,8 +813,8 @@ async fn collect_eviction_candidates(
|
|||||||
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
|
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
|
||||||
|
|
||||||
// get a snapshot of the list of tenants
|
// get a snapshot of the list of tenants
|
||||||
let tenants = tenant::mgr::list_tenants()
|
let tenants = tenant_manager
|
||||||
.await
|
.list_tenants()
|
||||||
.context("get list of tenants")?;
|
.context("get list of tenants")?;
|
||||||
|
|
||||||
// TODO: avoid listing every layer in every tenant: this loop can block the executor,
|
// TODO: avoid listing every layer in every tenant: this loop can block the executor,
|
||||||
@@ -827,8 +826,12 @@ async fn collect_eviction_candidates(
|
|||||||
if cancel.is_cancelled() {
|
if cancel.is_cancelled() {
|
||||||
return Ok(EvictionCandidates::Cancelled);
|
return Ok(EvictionCandidates::Cancelled);
|
||||||
}
|
}
|
||||||
let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
|
let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
|
||||||
Ok(tenant) => tenant,
|
Ok(tenant) if tenant.is_active() => tenant,
|
||||||
|
Ok(_) => {
|
||||||
|
debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
// this can happen if tenant has lifecycle transition after we fetched it
|
// this can happen if tenant has lifecycle transition after we fetched it
|
||||||
debug!("failed to get tenant: {e:#}");
|
debug!("failed to get tenant: {e:#}");
|
||||||
|
|||||||
@@ -567,9 +567,9 @@ paths:
|
|||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||||
/v1/tenant/{tenant_id}/location_config:
|
/v1/tenant/{tenant_shard_id}/location_config:
|
||||||
parameters:
|
parameters:
|
||||||
- name: tenant_id
|
- name: tenant_shard_id
|
||||||
in: path
|
in: path
|
||||||
required: true
|
required: true
|
||||||
schema:
|
schema:
|
||||||
@@ -932,6 +932,75 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||||
|
|
||||||
|
/v1/tenant/{tenant_shard_id}/heatmap_upload:
|
||||||
|
parameters:
|
||||||
|
- name: tenant_shard_id
|
||||||
|
in: path
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
post:
|
||||||
|
description: |
|
||||||
|
If the location is in an attached mode, upload the current state to the remote heatmap
|
||||||
|
responses:
|
||||||
|
"200":
|
||||||
|
description: Success
|
||||||
|
"500":
|
||||||
|
description: Generic operation error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
"503":
|
||||||
|
description: Temporarily unavailable, please retry.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||||
|
|
||||||
|
/v1/tenant/{tenant_shard_id}/secondary/download:
|
||||||
|
parameters:
|
||||||
|
- name: tenant_shard_id
|
||||||
|
in: path
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
- name: wait_ms
|
||||||
|
description: If set, we will wait this long for download to complete, and if it isn't complete then return 202
|
||||||
|
in: query
|
||||||
|
required: false
|
||||||
|
schema:
|
||||||
|
type: integer
|
||||||
|
post:
|
||||||
|
description: |
|
||||||
|
If the location is in secondary mode, download latest heatmap and layers
|
||||||
|
responses:
|
||||||
|
"200":
|
||||||
|
description: Success
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/SecondaryProgress"
|
||||||
|
"202":
|
||||||
|
description: Download has started but not yet finished
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/SecondaryProgress"
|
||||||
|
"500":
|
||||||
|
description: Generic operation error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
"503":
|
||||||
|
description: Temporarily unavailable, please retry.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/timeline/:
|
/v1/tenant/{tenant_id}/timeline/:
|
||||||
parameters:
|
parameters:
|
||||||
@@ -969,7 +1038,7 @@ paths:
|
|||||||
format: hex
|
format: hex
|
||||||
responses:
|
responses:
|
||||||
"201":
|
"201":
|
||||||
description: TimelineInfo
|
description: Timeline was created, or already existed with matching parameters
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
@@ -999,11 +1068,17 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/Error"
|
$ref: "#/components/schemas/Error"
|
||||||
"409":
|
"409":
|
||||||
description: Timeline already exists, creation skipped
|
description: Timeline already exists, with different parameters. Creation cannot proceed.
|
||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/ConflictError"
|
$ref: "#/components/schemas/ConflictError"
|
||||||
|
"429":
|
||||||
|
description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
"500":
|
"500":
|
||||||
description: Generic operation error
|
description: Generic operation error
|
||||||
content:
|
content:
|
||||||
@@ -1314,10 +1389,11 @@ components:
|
|||||||
TenantLocationConfigRequest:
|
TenantLocationConfigRequest:
|
||||||
type: object
|
type: object
|
||||||
required:
|
required:
|
||||||
- tenant_id
|
- mode
|
||||||
properties:
|
properties:
|
||||||
tenant_id:
|
tenant_id:
|
||||||
type: string
|
type: string
|
||||||
|
description: Not used, scheduled for removal.
|
||||||
mode:
|
mode:
|
||||||
type: string
|
type: string
|
||||||
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
|
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
|
||||||
@@ -1391,7 +1467,7 @@ components:
|
|||||||
trace_read_requests:
|
trace_read_requests:
|
||||||
type: boolean
|
type: boolean
|
||||||
heatmap_period:
|
heatmap_period:
|
||||||
type: integer
|
type: string
|
||||||
TenantConfigResponse:
|
TenantConfigResponse:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
@@ -1569,6 +1645,37 @@ components:
|
|||||||
Lower is better score for how good this pageserver would be for the next tenant.
|
Lower is better score for how good this pageserver would be for the next tenant.
|
||||||
The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
|
The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
|
||||||
|
|
||||||
|
SecondaryProgress:
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- heatmap_mtime
|
||||||
|
- layers_downloaded
|
||||||
|
- layers_total
|
||||||
|
- bytes_downloaded
|
||||||
|
- bytes_total
|
||||||
|
properties:
|
||||||
|
heatmap_mtime:
|
||||||
|
type: string
|
||||||
|
format: date-time
|
||||||
|
description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format)
|
||||||
|
layers_downloaded:
|
||||||
|
type: integer
|
||||||
|
format: int64
|
||||||
|
description: How many layers from the latest layer heatmap are present on disk
|
||||||
|
bytes_downloaded:
|
||||||
|
type: integer
|
||||||
|
format: int64
|
||||||
|
description: How many bytes of layer content from the latest layer heatmap are present on disk
|
||||||
|
layers_total:
|
||||||
|
type: integer
|
||||||
|
format: int64
|
||||||
|
description: How many layers were in the latest layer heatmap
|
||||||
|
bytes_total:
|
||||||
|
type: integer
|
||||||
|
format: int64
|
||||||
|
description: How many bytes of layer content were in the latest layer heatmap
|
||||||
|
|
||||||
|
|
||||||
Error:
|
Error:
|
||||||
type: object
|
type: object
|
||||||
required:
|
required:
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ use tokio_util::sync::CancellationToken;
|
|||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::auth::JwtAuth;
|
use utils::auth::JwtAuth;
|
||||||
use utils::failpoint_support::failpoints_handler;
|
use utils::failpoint_support::failpoints_handler;
|
||||||
|
use utils::http::endpoint::prometheus_metrics_handler;
|
||||||
use utils::http::endpoint::request_span;
|
use utils::http::endpoint::request_span;
|
||||||
use utils::http::json::json_request_or_empty_body;
|
use utils::http::json::json_request_or_empty_body;
|
||||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||||
@@ -48,8 +49,8 @@ use crate::task_mgr::TaskKind;
|
|||||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||||
use crate::tenant::mgr::GetActiveTenantError;
|
use crate::tenant::mgr::GetActiveTenantError;
|
||||||
use crate::tenant::mgr::{
|
use crate::tenant::mgr::{
|
||||||
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
|
GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError,
|
||||||
TenantSlotError, TenantSlotUpsertError, TenantStateError,
|
TenantSlotUpsertError, TenantStateError,
|
||||||
};
|
};
|
||||||
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
|
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
|
||||||
use crate::tenant::remote_timeline_client;
|
use crate::tenant::remote_timeline_client;
|
||||||
@@ -248,16 +249,11 @@ impl From<GetTenantError> for ApiError {
|
|||||||
fn from(tse: GetTenantError) -> ApiError {
|
fn from(tse: GetTenantError) -> ApiError {
|
||||||
match tse {
|
match tse {
|
||||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||||
GetTenantError::Broken(reason) => {
|
|
||||||
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
|
|
||||||
}
|
|
||||||
GetTenantError::NotActive(_) => {
|
GetTenantError::NotActive(_) => {
|
||||||
// Why is this not `ApiError::NotFound`?
|
// Why is this not `ApiError::NotFound`?
|
||||||
// Because we must be careful to never return 404 for a tenant if it does
|
// Because we must be careful to never return 404 for a tenant if it does
|
||||||
// in fact exist locally. If we did, the caller could draw the conclusion
|
// in fact exist locally. If we did, the caller could draw the conclusion
|
||||||
// that it can attach the tenant to another PS and we'd be in split-brain.
|
// that it can attach the tenant to another PS and we'd be in split-brain.
|
||||||
//
|
|
||||||
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
|
|
||||||
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
||||||
}
|
}
|
||||||
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
|
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
|
||||||
@@ -268,6 +264,9 @@ impl From<GetTenantError> for ApiError {
|
|||||||
impl From<GetActiveTenantError> for ApiError {
|
impl From<GetActiveTenantError> for ApiError {
|
||||||
fn from(e: GetActiveTenantError) -> ApiError {
|
fn from(e: GetActiveTenantError) -> ApiError {
|
||||||
match e {
|
match e {
|
||||||
|
GetActiveTenantError::Broken(reason) => {
|
||||||
|
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
|
||||||
|
}
|
||||||
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
|
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
|
||||||
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
|
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
|
||||||
GetActiveTenantError::NotFound(gte) => gte.into(),
|
GetActiveTenantError::NotFound(gte) => gte.into(),
|
||||||
@@ -278,19 +277,6 @@ impl From<GetActiveTenantError> for ApiError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<SetNewTenantConfigError> for ApiError {
|
|
||||||
fn from(e: SetNewTenantConfigError) -> ApiError {
|
|
||||||
match e {
|
|
||||||
SetNewTenantConfigError::GetTenant(tid) => {
|
|
||||||
ApiError::NotFound(anyhow!("tenant {}", tid).into())
|
|
||||||
}
|
|
||||||
e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
|
|
||||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||||
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
|
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
|
||||||
use crate::tenant::DeleteTimelineError::*;
|
use crate::tenant::DeleteTimelineError::*;
|
||||||
@@ -494,7 +480,7 @@ async fn timeline_create_handler(
|
|||||||
async {
|
async {
|
||||||
let tenant = state
|
let tenant = state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
|
||||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||||
|
|
||||||
@@ -534,10 +520,13 @@ async fn timeline_create_handler(
|
|||||||
HttpErrorBody::from_msg("Tenant shutting down".to_string()),
|
HttpErrorBody::from_msg("Tenant shutting down".to_string()),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
Err(
|
Err(e @ tenant::CreateTimelineError::Conflict) => {
|
||||||
tenant::CreateTimelineError::Conflict
|
json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string()))
|
||||||
| tenant::CreateTimelineError::AlreadyCreating,
|
}
|
||||||
) => json_response(StatusCode::CONFLICT, ()),
|
Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response(
|
||||||
|
StatusCode::TOO_MANY_REQUESTS,
|
||||||
|
HttpErrorBody::from_msg(e.to_string()),
|
||||||
|
),
|
||||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
|
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
|
||||||
StatusCode::NOT_ACCEPTABLE,
|
StatusCode::NOT_ACCEPTABLE,
|
||||||
HttpErrorBody::from_msg(format!("{err:#}")),
|
HttpErrorBody::from_msg(format!("{err:#}")),
|
||||||
@@ -580,7 +569,7 @@ async fn timeline_list_handler(
|
|||||||
let response_data = async {
|
let response_data = async {
|
||||||
let tenant = state
|
let tenant = state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
|
||||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||||
|
|
||||||
@@ -618,6 +607,7 @@ async fn timeline_preserve_initdb_handler(
|
|||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
let state = get_state(&request);
|
||||||
|
|
||||||
// Part of the process for disaster recovery from safekeeper-stored WAL:
|
// Part of the process for disaster recovery from safekeeper-stored WAL:
|
||||||
// If we don't recover into a new timeline but want to keep the timeline ID,
|
// If we don't recover into a new timeline but want to keep the timeline ID,
|
||||||
@@ -625,7 +615,9 @@ async fn timeline_preserve_initdb_handler(
|
|||||||
// location where timeline recreation cand find it.
|
// location where timeline recreation cand find it.
|
||||||
|
|
||||||
async {
|
async {
|
||||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
let tenant = state
|
||||||
|
.tenant_manager
|
||||||
|
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
|
||||||
let timeline = tenant
|
let timeline = tenant
|
||||||
.get_timeline(timeline_id, false)
|
.get_timeline(timeline_id, false)
|
||||||
@@ -667,7 +659,7 @@ async fn timeline_detail_handler(
|
|||||||
let timeline_info = async {
|
let timeline_info = async {
|
||||||
let tenant = state
|
let tenant = state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
|
||||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||||
|
|
||||||
@@ -854,7 +846,7 @@ async fn timeline_delete_handler(
|
|||||||
|
|
||||||
let tenant = state
|
let tenant = state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.get_attached_tenant_shard(tenant_shard_id, false)
|
.get_attached_tenant_shard(tenant_shard_id)
|
||||||
.map_err(|e| {
|
.map_err(|e| {
|
||||||
match e {
|
match e {
|
||||||
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
|
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
|
||||||
@@ -885,14 +877,16 @@ async fn tenant_detach_handler(
|
|||||||
|
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
let conf = state.conf;
|
let conf = state.conf;
|
||||||
mgr::detach_tenant(
|
state
|
||||||
conf,
|
.tenant_manager
|
||||||
tenant_shard_id,
|
.detach_tenant(
|
||||||
detach_ignored.unwrap_or(false),
|
conf,
|
||||||
&state.deletion_queue_client,
|
tenant_shard_id,
|
||||||
)
|
detach_ignored.unwrap_or(false),
|
||||||
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
|
&state.deletion_queue_client,
|
||||||
.await?;
|
)
|
||||||
|
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
|
||||||
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
@@ -970,10 +964,11 @@ async fn tenant_list_handler(
|
|||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
check_permission(&request, None)?;
|
check_permission(&request, None)?;
|
||||||
|
let state = get_state(&request);
|
||||||
|
|
||||||
let response_data = mgr::list_tenants()
|
let response_data = state
|
||||||
.instrument(info_span!("tenant_list"))
|
.tenant_manager
|
||||||
.await
|
.list_tenants()
|
||||||
.map_err(|_| {
|
.map_err(|_| {
|
||||||
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
|
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
|
||||||
})?
|
})?
|
||||||
@@ -996,9 +991,12 @@ async fn tenant_status(
|
|||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
let state = get_state(&request);
|
||||||
|
|
||||||
let tenant_info = async {
|
let tenant_info = async {
|
||||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
let tenant = state
|
||||||
|
.tenant_manager
|
||||||
|
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
|
||||||
// Calculate total physical size of all timelines
|
// Calculate total physical size of all timelines
|
||||||
let mut current_physical_size = 0;
|
let mut current_physical_size = 0;
|
||||||
@@ -1071,9 +1069,7 @@ async fn tenant_size_handler(
|
|||||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||||
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
||||||
let headers = request.headers();
|
let headers = request.headers();
|
||||||
|
let state = get_state(&request);
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
|
||||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
|
||||||
|
|
||||||
if !tenant_shard_id.is_zero() {
|
if !tenant_shard_id.is_zero() {
|
||||||
return Err(ApiError::BadRequest(anyhow!(
|
return Err(ApiError::BadRequest(anyhow!(
|
||||||
@@ -1081,6 +1077,12 @@ async fn tenant_size_handler(
|
|||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
let tenant = state
|
||||||
|
.tenant_manager
|
||||||
|
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||||
|
|
||||||
// this can be long operation
|
// this can be long operation
|
||||||
let inputs = tenant
|
let inputs = tenant
|
||||||
.gather_size_inputs(
|
.gather_size_inputs(
|
||||||
@@ -1149,9 +1151,19 @@ async fn tenant_shard_split_handler(
|
|||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||||
|
|
||||||
|
let tenant = state
|
||||||
|
.tenant_manager
|
||||||
|
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||||
|
|
||||||
let new_shards = state
|
let new_shards = state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
|
.shard_split(
|
||||||
|
tenant,
|
||||||
|
ShardCount::new(req.new_shard_count),
|
||||||
|
req.new_stripe_size,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
@@ -1365,8 +1377,11 @@ async fn get_tenant_config_handler(
|
|||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
let state = get_state(&request);
|
||||||
|
|
||||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
let tenant = state
|
||||||
|
.tenant_manager
|
||||||
|
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
|
||||||
let response = HashMap::from([
|
let response = HashMap::from([
|
||||||
(
|
(
|
||||||
@@ -1394,13 +1409,31 @@ async fn update_tenant_config_handler(
|
|||||||
let tenant_id = request_data.tenant_id;
|
let tenant_id = request_data.tenant_id;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let tenant_conf =
|
let new_tenant_conf =
|
||||||
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||||
|
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
|
||||||
.instrument(info_span!("tenant_config", %tenant_id))
|
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||||
.await?;
|
|
||||||
|
let tenant = state
|
||||||
|
.tenant_manager
|
||||||
|
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||||
|
|
||||||
|
// This is a legacy API that only operates on attached tenants: the preferred
|
||||||
|
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||||
|
// the full LocationConf.
|
||||||
|
let location_conf = LocationConf::attached_single(
|
||||||
|
new_tenant_conf.clone(),
|
||||||
|
tenant.get_generation(),
|
||||||
|
&ShardParameters::default(),
|
||||||
|
);
|
||||||
|
|
||||||
|
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
tenant.set_new_tenant_config(new_tenant_conf);
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
@@ -1423,13 +1456,14 @@ async fn put_tenant_location_config_handler(
|
|||||||
// The `Detached` state is special, it doesn't upsert a tenant, it removes
|
// The `Detached` state is special, it doesn't upsert a tenant, it removes
|
||||||
// its local disk content and drops it from memory.
|
// its local disk content and drops it from memory.
|
||||||
if let LocationConfigMode::Detached = request_data.config.mode {
|
if let LocationConfigMode::Detached = request_data.config.mode {
|
||||||
if let Err(e) =
|
if let Err(e) = state
|
||||||
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
.tenant_manager
|
||||||
.instrument(info_span!("tenant_detach",
|
.detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
||||||
tenant_id = %tenant_shard_id.tenant_id,
|
.instrument(info_span!("tenant_detach",
|
||||||
shard_id = %tenant_shard_id.shard_slug()
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
))
|
shard_id = %tenant_shard_id.shard_slug()
|
||||||
.await
|
))
|
||||||
|
.await
|
||||||
{
|
{
|
||||||
match e {
|
match e {
|
||||||
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
|
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
|
||||||
@@ -1623,10 +1657,12 @@ async fn handle_tenant_break(
|
|||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
||||||
|
|
||||||
let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
|
let state = get_state(&r);
|
||||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
state
|
||||||
|
.tenant_manager
|
||||||
tenant.set_broken("broken from test".to_owned()).await;
|
.get_attached_tenant_shard(tenant_shard_id)?
|
||||||
|
.set_broken("broken from test".to_owned())
|
||||||
|
.await;
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
@@ -1643,8 +1679,7 @@ async fn timeline_gc_handler(
|
|||||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let wait_task_done =
|
let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
|
||||||
mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
|
|
||||||
let gc_result = wait_task_done
|
let gc_result = wait_task_done
|
||||||
.await
|
.await
|
||||||
.context("wait for gc task")
|
.context("wait for gc task")
|
||||||
@@ -1871,7 +1906,7 @@ async fn active_timeline_of_active_tenant(
|
|||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> Result<Arc<Timeline>, ApiError> {
|
) -> Result<Arc<Timeline>, ApiError> {
|
||||||
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
|
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
|
||||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||||
|
|
||||||
@@ -1982,13 +2017,42 @@ async fn secondary_download_handler(
|
|||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
state
|
let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis);
|
||||||
.secondary_controller
|
|
||||||
.download_tenant(tenant_shard_id)
|
|
||||||
.await
|
|
||||||
.map_err(ApiError::InternalServerError)?;
|
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
// We don't need this to issue the download request, but:
|
||||||
|
// - it enables us to cleanly return 404 if we get a request for an absent shard
|
||||||
|
// - we will use this to provide status feedback in the response
|
||||||
|
let Some(secondary_tenant) = state
|
||||||
|
.tenant_manager
|
||||||
|
.get_secondary_tenant_shard(tenant_shard_id)
|
||||||
|
else {
|
||||||
|
return Err(ApiError::NotFound(
|
||||||
|
anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
|
||||||
|
));
|
||||||
|
};
|
||||||
|
|
||||||
|
let timeout = wait.unwrap_or(Duration::MAX);
|
||||||
|
|
||||||
|
let status = match tokio::time::timeout(
|
||||||
|
timeout,
|
||||||
|
state.secondary_controller.download_tenant(tenant_shard_id),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
// Download job ran to completion.
|
||||||
|
Ok(Ok(())) => StatusCode::OK,
|
||||||
|
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
|
||||||
|
// okay. We could get an error here in the unlikely edge case that the tenant
|
||||||
|
// was detached between our check above and executing the download job.
|
||||||
|
Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
|
||||||
|
// A timeout is not an error: we have started the download, we're just not done
|
||||||
|
// yet. The caller will get a response body indicating status.
|
||||||
|
Err(_) => StatusCode::ACCEPTED,
|
||||||
|
};
|
||||||
|
|
||||||
|
let progress = secondary_tenant.progress.lock().unwrap().clone();
|
||||||
|
|
||||||
|
json_response(status, progress)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
@@ -2048,6 +2112,10 @@ async fn get_utilization(
|
|||||||
r: Request<Body>,
|
r: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
|
fail::fail_point!("get-utilization-http-handler", |_| {
|
||||||
|
Err(ApiError::ResourceUnavailable("failpoint".into()))
|
||||||
|
});
|
||||||
|
|
||||||
// this probably could be completely public, but lets make that change later.
|
// this probably could be completely public, but lets make that change later.
|
||||||
check_permission(&r, None)?;
|
check_permission(&r, None)?;
|
||||||
|
|
||||||
@@ -2224,6 +2292,7 @@ pub fn make_router(
|
|||||||
|
|
||||||
Ok(router
|
Ok(router
|
||||||
.data(state)
|
.data(state)
|
||||||
|
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||||
.get("/v1/status", |r| api_handler(r, status_handler))
|
.get("/v1/status", |r| api_handler(r, status_handler))
|
||||||
.put("/v1/failpoints", |r| {
|
.put("/v1/failpoints", |r| {
|
||||||
testing_api_handler("manage failpoints", r, failpoints_handler)
|
testing_api_handler("manage failpoints", r, failpoints_handler)
|
||||||
|
|||||||
@@ -2,28 +2,20 @@
|
|||||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||||
//! a neon Timeline.
|
//! a neon Timeline.
|
||||||
//!
|
//!
|
||||||
use std::io::SeekFrom;
|
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use anyhow::{bail, ensure, Context, Result};
|
use anyhow::{bail, ensure, Context, Result};
|
||||||
use async_compression::tokio::bufread::ZstdDecoder;
|
|
||||||
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
|
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
use nix::NixPath;
|
use tokio::io::{AsyncRead, AsyncReadExt};
|
||||||
use tokio::fs::{File, OpenOptions};
|
|
||||||
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
|
|
||||||
use tokio_tar::Archive;
|
use tokio_tar::Archive;
|
||||||
use tokio_tar::Builder;
|
|
||||||
use tokio_tar::HeaderMode;
|
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use walkdir::WalkDir;
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::metrics::WAL_INGEST;
|
use crate::metrics::WAL_INGEST;
|
||||||
use crate::pgdatadir_mapping::*;
|
use crate::pgdatadir_mapping::*;
|
||||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
|
||||||
use crate::tenant::Timeline;
|
use crate::tenant::Timeline;
|
||||||
use crate::walingest::WalIngest;
|
use crate::walingest::WalIngest;
|
||||||
use crate::walrecord::DecodedWALRecord;
|
use crate::walrecord::DecodedWALRecord;
|
||||||
@@ -633,65 +625,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
|
|||||||
reader.read_to_end(&mut buf).await?;
|
reader.read_to_end(&mut buf).await?;
|
||||||
Ok(Bytes::from(buf))
|
Ok(Bytes::from(buf))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
|
|
||||||
let file = OpenOptions::new()
|
|
||||||
.create(true)
|
|
||||||
.truncate(true)
|
|
||||||
.read(true)
|
|
||||||
.write(true)
|
|
||||||
.open(&tmp_path)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("tempfile creation {tmp_path}"))?;
|
|
||||||
|
|
||||||
let mut paths = Vec::new();
|
|
||||||
for entry in WalkDir::new(pgdata_path) {
|
|
||||||
let entry = entry?;
|
|
||||||
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
|
||||||
// Also allow directories so that we also get empty directories
|
|
||||||
if !(metadata.is_file() || metadata.is_dir()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let path = entry.into_path();
|
|
||||||
paths.push(path);
|
|
||||||
}
|
|
||||||
// Do a sort to get a more consistent listing
|
|
||||||
paths.sort_unstable();
|
|
||||||
let zstd = ZstdEncoder::with_quality_and_params(
|
|
||||||
file,
|
|
||||||
Level::Default,
|
|
||||||
&[CParameter::enable_long_distance_matching(true)],
|
|
||||||
);
|
|
||||||
let mut builder = Builder::new(zstd);
|
|
||||||
// Use reproducible header mode
|
|
||||||
builder.mode(HeaderMode::Deterministic);
|
|
||||||
for path in paths {
|
|
||||||
let rel_path = path.strip_prefix(pgdata_path)?;
|
|
||||||
if rel_path.is_empty() {
|
|
||||||
// The top directory should not be compressed,
|
|
||||||
// the tar crate doesn't like that
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
builder.append_path_with_name(&path, rel_path).await?;
|
|
||||||
}
|
|
||||||
let mut zstd = builder.into_inner().await?;
|
|
||||||
zstd.shutdown().await?;
|
|
||||||
let mut compressed = zstd.into_inner();
|
|
||||||
let compressed_len = compressed.metadata().await?.len();
|
|
||||||
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
|
|
||||||
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
|
|
||||||
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
|
|
||||||
}
|
|
||||||
compressed.seek(SeekFrom::Start(0)).await?;
|
|
||||||
Ok((compressed, compressed_len))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn extract_tar_zst(
|
|
||||||
pgdata_path: &Utf8Path,
|
|
||||||
tar_zst: impl AsyncBufRead + Unpin,
|
|
||||||
) -> Result<()> {
|
|
||||||
let tar = Box::pin(ZstdDecoder::new(tar_zst));
|
|
||||||
let mut archive = Archive::new(tar);
|
|
||||||
archive.unpack(pgdata_path).await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ pub mod walredo;
|
|||||||
use crate::task_mgr::TaskKind;
|
use crate::task_mgr::TaskKind;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use deletion_queue::DeletionQueue;
|
use deletion_queue::DeletionQueue;
|
||||||
|
use tenant::mgr::TenantManager;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
/// Current storage format version
|
/// Current storage format version
|
||||||
@@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
|||||||
pub use crate::metrics::preinitialize_metrics;
|
pub use crate::metrics::preinitialize_metrics;
|
||||||
|
|
||||||
#[tracing::instrument(skip_all, fields(%exit_code))]
|
#[tracing::instrument(skip_all, fields(%exit_code))]
|
||||||
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
|
pub async fn shutdown_pageserver(
|
||||||
|
tenant_manager: &TenantManager,
|
||||||
|
deletion_queue: Option<DeletionQueue>,
|
||||||
|
exit_code: i32,
|
||||||
|
) {
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
// Shut down the libpq endpoint task. This prevents new connections from
|
// Shut down the libpq endpoint task. This prevents new connections from
|
||||||
// being accepted.
|
// being accepted.
|
||||||
@@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
|||||||
// Shut down all the tenants. This flushes everything to disk and kills
|
// Shut down all the tenants. This flushes everything to disk and kills
|
||||||
// the checkpoint and GC tasks.
|
// the checkpoint and GC tasks.
|
||||||
timed(
|
timed(
|
||||||
tenant::mgr::shutdown_all_tenants(),
|
tenant_manager.shutdown(),
|
||||||
"shutdown all tenants",
|
"shutdown all tenants",
|
||||||
Duration::from_secs(5),
|
Duration::from_secs(5),
|
||||||
)
|
)
|
||||||
@@ -114,27 +119,27 @@ pub const METADATA_FILE_NAME: &str = "metadata";
|
|||||||
|
|
||||||
/// Per-tenant configuration file.
|
/// Per-tenant configuration file.
|
||||||
/// Full path: `tenants/<tenant_id>/config`.
|
/// Full path: `tenants/<tenant_id>/config`.
|
||||||
pub const TENANT_CONFIG_NAME: &str = "config";
|
pub(crate) const TENANT_CONFIG_NAME: &str = "config";
|
||||||
|
|
||||||
/// Per-tenant configuration file.
|
/// Per-tenant configuration file.
|
||||||
/// Full path: `tenants/<tenant_id>/config`.
|
/// Full path: `tenants/<tenant_id>/config`.
|
||||||
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||||
|
|
||||||
/// Per-tenant copy of their remote heatmap, downloaded into the local
|
/// Per-tenant copy of their remote heatmap, downloaded into the local
|
||||||
/// tenant path while in secondary mode.
|
/// tenant path while in secondary mode.
|
||||||
pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
||||||
|
|
||||||
/// A suffix used for various temporary files. Any temporary files found in the
|
/// A suffix used for various temporary files. Any temporary files found in the
|
||||||
/// data directory at pageserver startup can be automatically removed.
|
/// data directory at pageserver startup can be automatically removed.
|
||||||
pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||||
|
|
||||||
/// A marker file to mark that a timeline directory was not fully initialized.
|
/// A marker file to mark that a timeline directory was not fully initialized.
|
||||||
/// If a timeline directory with this marker is encountered at pageserver startup,
|
/// If a timeline directory with this marker is encountered at pageserver startup,
|
||||||
/// the timeline directory and the marker file are both removed.
|
/// the timeline directory and the marker file are both removed.
|
||||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||||
|
|
||||||
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||||
|
|
||||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||||
@@ -161,11 +166,11 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
|
|||||||
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
|
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
|
||||||
// from the name.
|
// from the name.
|
||||||
|
|
||||||
pub fn is_uninit_mark(path: &Utf8Path) -> bool {
|
pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
|
||||||
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_delete_mark(path: &Utf8Path) -> bool {
|
pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
|
||||||
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
use enum_map::EnumMap;
|
use enum_map::EnumMap;
|
||||||
use metrics::metric_vec_duration::DurationResultObserver;
|
|
||||||
use metrics::{
|
use metrics::{
|
||||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||||
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
|
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
|
||||||
@@ -168,7 +167,7 @@ impl GetVectoredLatency {
|
|||||||
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
|
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
|
||||||
let inner = register_histogram_vec!(
|
let inner = register_histogram_vec!(
|
||||||
"pageserver_get_vectored_seconds",
|
"pageserver_get_vectored_seconds",
|
||||||
"Time spent in get_vectored",
|
"Time spent in get_vectored, excluding time spent in timeline_get_throttle.",
|
||||||
&["task_kind"],
|
&["task_kind"],
|
||||||
CRITICAL_OP_BUCKETS.into(),
|
CRITICAL_OP_BUCKETS.into(),
|
||||||
)
|
)
|
||||||
@@ -436,7 +435,7 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(||
|
|||||||
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||||
register_uint_gauge_vec!(
|
register_uint_gauge_vec!(
|
||||||
"pageserver_remote_physical_size",
|
"pageserver_remote_physical_size",
|
||||||
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
|
"The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
|
||||||
// Corollary: If any files are missing from the index part, they won't be included here.
|
// Corollary: If any files are missing from the index part, they won't be included here.
|
||||||
&["tenant_id", "shard_id", "timeline_id"]
|
&["tenant_id", "shard_id", "timeline_id"]
|
||||||
)
|
)
|
||||||
@@ -700,6 +699,14 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
|
|||||||
.expect("Failed to register pageserver_startup_is_loading")
|
.expect("Failed to register pageserver_startup_is_loading")
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
|
||||||
|
register_uint_gauge!(
|
||||||
|
"pageserver_timeline_ephemeral_bytes",
|
||||||
|
"Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated."
|
||||||
|
)
|
||||||
|
.expect("Failed to register metric")
|
||||||
|
});
|
||||||
|
|
||||||
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
|
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
|
||||||
/// like how long it took to load.
|
/// like how long it took to load.
|
||||||
///
|
///
|
||||||
@@ -1283,11 +1290,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
|
|||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
impl DurationResultObserver for BasebackupQueryTime {
|
pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
|
parent: &'a BasebackupQueryTime,
|
||||||
|
ctx: &'c RequestContext,
|
||||||
|
start: std::time::Instant,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BasebackupQueryTime {
|
||||||
|
pub(crate) fn start_recording<'c: 'a, 'a>(
|
||||||
|
&'a self,
|
||||||
|
ctx: &'c RequestContext,
|
||||||
|
) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
|
||||||
|
let start = Instant::now();
|
||||||
|
match ctx.micros_spent_throttled.open() {
|
||||||
|
Ok(()) => (),
|
||||||
|
Err(error) => {
|
||||||
|
use utils::rate_limit::RateLimit;
|
||||||
|
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||||
|
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||||
|
let mut rate_limit = LOGGED.lock().unwrap();
|
||||||
|
rate_limit.call(|| {
|
||||||
|
warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
BasebackupQueryTimeOngoingRecording {
|
||||||
|
parent: self,
|
||||||
|
ctx,
|
||||||
|
start,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||||
|
pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
|
||||||
|
let elapsed = self.start.elapsed();
|
||||||
|
let ex_throttled = self
|
||||||
|
.ctx
|
||||||
|
.micros_spent_throttled
|
||||||
|
.close_and_checked_sub_from(elapsed);
|
||||||
|
let ex_throttled = match ex_throttled {
|
||||||
|
Ok(ex_throttled) => ex_throttled,
|
||||||
|
Err(error) => {
|
||||||
|
use utils::rate_limit::RateLimit;
|
||||||
|
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||||
|
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||||
|
let mut rate_limit = LOGGED.lock().unwrap();
|
||||||
|
rate_limit.call(|| {
|
||||||
|
warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit");
|
||||||
|
});
|
||||||
|
elapsed
|
||||||
|
}
|
||||||
|
};
|
||||||
let label_value = if res.is_ok() { "ok" } else { "error" };
|
let label_value = if res.is_ok() { "ok" } else { "error" };
|
||||||
let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
|
let metric = self
|
||||||
metric.observe(duration.as_secs_f64());
|
.parent
|
||||||
|
.0
|
||||||
|
.get_metric_with_label_values(&[label_value])
|
||||||
|
.unwrap();
|
||||||
|
metric.observe(ex_throttled.as_secs_f64());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1964,10 +2025,8 @@ impl TimelineMetrics {
|
|||||||
pub(crate) fn resident_physical_size_get(&self) -> u64 {
|
pub(crate) fn resident_physical_size_get(&self) -> u64 {
|
||||||
self.resident_physical_size_gauge.get()
|
self.resident_physical_size_gauge.get()
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl Drop for TimelineMetrics {
|
pub(crate) fn shutdown(&self) {
|
||||||
fn drop(&mut self) {
|
|
||||||
let tenant_id = &self.tenant_id;
|
let tenant_id = &self.tenant_id;
|
||||||
let timeline_id = &self.timeline_id;
|
let timeline_id = &self.timeline_id;
|
||||||
let shard_id = &self.shard_id;
|
let shard_id = &self.shard_id;
|
||||||
@@ -2414,7 +2473,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub mod tokio_epoll_uring {
|
pub mod tokio_epoll_uring {
|
||||||
use metrics::UIntGauge;
|
use metrics::{register_int_counter, UIntGauge};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
pub struct Collector {
|
pub struct Collector {
|
||||||
descs: Vec<metrics::core::Desc>,
|
descs: Vec<metrics::core::Desc>,
|
||||||
@@ -2422,15 +2482,13 @@ pub mod tokio_epoll_uring {
|
|||||||
systems_destroyed: UIntGauge,
|
systems_destroyed: UIntGauge,
|
||||||
}
|
}
|
||||||
|
|
||||||
const NMETRICS: usize = 2;
|
|
||||||
|
|
||||||
impl metrics::core::Collector for Collector {
|
impl metrics::core::Collector for Collector {
|
||||||
fn desc(&self) -> Vec<&metrics::core::Desc> {
|
fn desc(&self) -> Vec<&metrics::core::Desc> {
|
||||||
self.descs.iter().collect()
|
self.descs.iter().collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
|
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
|
||||||
let mut mfs = Vec::with_capacity(NMETRICS);
|
let mut mfs = Vec::with_capacity(Self::NMETRICS);
|
||||||
let tokio_epoll_uring::metrics::Metrics {
|
let tokio_epoll_uring::metrics::Metrics {
|
||||||
systems_created,
|
systems_created,
|
||||||
systems_destroyed,
|
systems_destroyed,
|
||||||
@@ -2444,6 +2502,8 @@ pub mod tokio_epoll_uring {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Collector {
|
impl Collector {
|
||||||
|
const NMETRICS: usize = 2;
|
||||||
|
|
||||||
#[allow(clippy::new_without_default)]
|
#[allow(clippy::new_without_default)]
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
let mut descs = Vec::new();
|
let mut descs = Vec::new();
|
||||||
@@ -2477,6 +2537,22 @@ pub mod tokio_epoll_uring {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy<metrics::IntCounter> = Lazy::new(|| {
|
||||||
|
register_int_counter!(
|
||||||
|
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count",
|
||||||
|
"Number of times where thread_local_system creation spanned multiple executor threads",
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy<metrics::IntCounter> = Lazy::new(|| {
|
||||||
|
register_int_counter!(
|
||||||
|
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count",
|
||||||
|
"Number of times thread_local_system creation failed and was retried after back-off.",
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) mod tenant_throttling {
|
pub(crate) mod tenant_throttling {
|
||||||
@@ -2605,6 +2681,8 @@ pub fn preinitialize_metrics() {
|
|||||||
&WALRECEIVER_BROKER_UPDATES,
|
&WALRECEIVER_BROKER_UPDATES,
|
||||||
&WALRECEIVER_CANDIDATES_ADDED,
|
&WALRECEIVER_CANDIDATES_ADDED,
|
||||||
&WALRECEIVER_CANDIDATES_REMOVED,
|
&WALRECEIVER_CANDIDATES_REMOVED,
|
||||||
|
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
|
||||||
|
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
|
||||||
]
|
]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.for_each(|c| {
|
.for_each(|c| {
|
||||||
@@ -2623,6 +2701,12 @@ pub fn preinitialize_metrics() {
|
|||||||
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
|
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
|
||||||
Lazy::force(&disk_usage_based_eviction::METRICS);
|
Lazy::force(&disk_usage_based_eviction::METRICS);
|
||||||
|
|
||||||
|
for state_name in pageserver_api::models::TenantState::VARIANTS {
|
||||||
|
// initialize the metric for all gauges, otherwise the time series might seemingly show
|
||||||
|
// values from last restart.
|
||||||
|
TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
|
||||||
|
}
|
||||||
|
|
||||||
// countervecs
|
// countervecs
|
||||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
|
|||||||
@@ -760,6 +760,7 @@ impl PageServerHandler {
|
|||||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
|
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
|
||||||
timeline
|
timeline
|
||||||
.import_basebackup_from_tar(
|
.import_basebackup_from_tar(
|
||||||
|
tenant.clone(),
|
||||||
&mut copyin_reader,
|
&mut copyin_reader,
|
||||||
base_lsn,
|
base_lsn,
|
||||||
self.broker_client.clone(),
|
self.broker_client.clone(),
|
||||||
@@ -1199,7 +1200,7 @@ impl PageServerHandler {
|
|||||||
prev_lsn: Option<Lsn>,
|
prev_lsn: Option<Lsn>,
|
||||||
full_backup: bool,
|
full_backup: bool,
|
||||||
gzip: bool,
|
gzip: bool,
|
||||||
ctx: RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<(), QueryError>
|
) -> Result<(), QueryError>
|
||||||
where
|
where
|
||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
@@ -1214,7 +1215,7 @@ impl PageServerHandler {
|
|||||||
if let Some(lsn) = lsn {
|
if let Some(lsn) = lsn {
|
||||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||||
info!("waiting for {}", lsn);
|
info!("waiting for {}", lsn);
|
||||||
timeline.wait_lsn(lsn, &ctx).await?;
|
timeline.wait_lsn(lsn, ctx).await?;
|
||||||
timeline
|
timeline
|
||||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||||
.context("invalid basebackup lsn")?;
|
.context("invalid basebackup lsn")?;
|
||||||
@@ -1236,7 +1237,7 @@ impl PageServerHandler {
|
|||||||
lsn,
|
lsn,
|
||||||
prev_lsn,
|
prev_lsn,
|
||||||
full_backup,
|
full_backup,
|
||||||
&ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
} else {
|
} else {
|
||||||
@@ -1257,7 +1258,7 @@ impl PageServerHandler {
|
|||||||
lsn,
|
lsn,
|
||||||
prev_lsn,
|
prev_lsn,
|
||||||
full_backup,
|
full_backup,
|
||||||
&ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
// shutdown the encoder to ensure the gzip footer is written
|
// shutdown the encoder to ensure the gzip footer is written
|
||||||
@@ -1269,7 +1270,7 @@ impl PageServerHandler {
|
|||||||
lsn,
|
lsn,
|
||||||
prev_lsn,
|
prev_lsn,
|
||||||
full_backup,
|
full_backup,
|
||||||
&ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
@@ -1449,25 +1450,25 @@ where
|
|||||||
false
|
false
|
||||||
};
|
};
|
||||||
|
|
||||||
::metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||||
&*metrics::BASEBACKUP_QUERY_TIME,
|
let res = async {
|
||||||
async move {
|
self.handle_basebackup_request(
|
||||||
self.handle_basebackup_request(
|
pgb,
|
||||||
pgb,
|
tenant_id,
|
||||||
tenant_id,
|
timeline_id,
|
||||||
timeline_id,
|
lsn,
|
||||||
lsn,
|
None,
|
||||||
None,
|
false,
|
||||||
false,
|
gzip,
|
||||||
gzip,
|
&ctx,
|
||||||
ctx,
|
)
|
||||||
)
|
.await?;
|
||||||
.await?;
|
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
Result::<(), QueryError>::Ok(())
|
||||||
Result::<(), QueryError>::Ok(())
|
}
|
||||||
},
|
.await;
|
||||||
)
|
metric_recording.observe(&res);
|
||||||
.await?;
|
res?;
|
||||||
}
|
}
|
||||||
// return pair of prev_lsn and last_lsn
|
// return pair of prev_lsn and last_lsn
|
||||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||||
@@ -1563,7 +1564,7 @@ where
|
|||||||
prev_lsn,
|
prev_lsn,
|
||||||
true,
|
true,
|
||||||
false,
|
false,
|
||||||
ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ use strum::IntoEnumIterator;
|
|||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{debug, trace, warn};
|
use tracing::{debug, trace, warn};
|
||||||
use utils::bin_ser::DeserializeError;
|
use utils::bin_ser::DeserializeError;
|
||||||
|
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||||
|
|
||||||
const MAX_AUX_FILE_DELTAS: usize = 1024;
|
const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||||
@@ -1546,12 +1547,13 @@ impl<'a> DatadirModification<'a> {
|
|||||||
if !self.pending_updates.is_empty() {
|
if !self.pending_updates.is_empty() {
|
||||||
// The put_batch call below expects expects the inputs to be sorted by Lsn,
|
// The put_batch call below expects expects the inputs to be sorted by Lsn,
|
||||||
// so we do that first.
|
// so we do that first.
|
||||||
let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self
|
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
|
||||||
.pending_updates
|
self.pending_updates
|
||||||
.drain()
|
.drain()
|
||||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
|
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
|
||||||
.kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
|
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
|
||||||
.collect();
|
VecMapOrdering::GreaterOrEqual,
|
||||||
|
);
|
||||||
|
|
||||||
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -50,8 +50,6 @@ use once_cell::sync::Lazy;
|
|||||||
|
|
||||||
use utils::id::TimelineId;
|
use utils::id::TimelineId;
|
||||||
|
|
||||||
use crate::shutdown_pageserver;
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// There are four runtimes:
|
// There are four runtimes:
|
||||||
//
|
//
|
||||||
@@ -272,9 +270,6 @@ pub enum TaskKind {
|
|||||||
// Task that uploads a file to remote storage
|
// Task that uploads a file to remote storage
|
||||||
RemoteUploadTask,
|
RemoteUploadTask,
|
||||||
|
|
||||||
// Task that downloads a file from remote storage
|
|
||||||
RemoteDownloadTask,
|
|
||||||
|
|
||||||
// task that handles the initial downloading of all tenants
|
// task that handles the initial downloading of all tenants
|
||||||
InitialLoad,
|
InitialLoad,
|
||||||
|
|
||||||
@@ -456,7 +451,7 @@ async fn task_finish(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if shutdown_process {
|
if shutdown_process {
|
||||||
shutdown_pageserver(None, 1).await;
|
std::process::exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -43,6 +43,8 @@ use utils::sync::gate::Gate;
|
|||||||
use utils::sync::gate::GateGuard;
|
use utils::sync::gate::GateGuard;
|
||||||
use utils::timeout::timeout_cancellable;
|
use utils::timeout::timeout_cancellable;
|
||||||
use utils::timeout::TimeoutCancellableError;
|
use utils::timeout::TimeoutCancellableError;
|
||||||
|
use utils::zstd::create_zst_tarball;
|
||||||
|
use utils::zstd::extract_zst_tarball;
|
||||||
|
|
||||||
use self::config::AttachedLocationConfig;
|
use self::config::AttachedLocationConfig;
|
||||||
use self::config::AttachmentMode;
|
use self::config::AttachmentMode;
|
||||||
@@ -55,8 +57,8 @@ use self::mgr::GetTenantError;
|
|||||||
use self::mgr::TenantsMap;
|
use self::mgr::TenantsMap;
|
||||||
use self::remote_timeline_client::upload::upload_index_part;
|
use self::remote_timeline_client::upload::upload_index_part;
|
||||||
use self::remote_timeline_client::RemoteTimelineClient;
|
use self::remote_timeline_client::RemoteTimelineClient;
|
||||||
|
use self::timeline::uninit::TimelineCreateGuard;
|
||||||
use self::timeline::uninit::TimelineExclusionError;
|
use self::timeline::uninit::TimelineExclusionError;
|
||||||
use self::timeline::uninit::TimelineUninitMark;
|
|
||||||
use self::timeline::uninit::UninitializedTimeline;
|
use self::timeline::uninit::UninitializedTimeline;
|
||||||
use self::timeline::EvictionTaskTenantState;
|
use self::timeline::EvictionTaskTenantState;
|
||||||
use self::timeline::TimelineResources;
|
use self::timeline::TimelineResources;
|
||||||
@@ -200,6 +202,13 @@ pub(super) struct AttachedTenantConf {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl AttachedTenantConf {
|
impl AttachedTenantConf {
|
||||||
|
fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
|
||||||
|
Self {
|
||||||
|
tenant_conf,
|
||||||
|
location,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
|
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
|
||||||
match &location_conf.mode {
|
match &location_conf.mode {
|
||||||
LocationMode::Attached(attach_conf) => Ok(Self {
|
LocationMode::Attached(attach_conf) => Ok(Self {
|
||||||
@@ -565,9 +574,8 @@ impl Tenant {
|
|||||||
// avoiding holding it across awaits
|
// avoiding holding it across awaits
|
||||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||||
match timelines_accessor.entry(timeline_id) {
|
match timelines_accessor.entry(timeline_id) {
|
||||||
|
// We should never try and load the same timeline twice during startup
|
||||||
Entry::Occupied(_) => {
|
Entry::Occupied(_) => {
|
||||||
// The uninit mark file acts as a lock that prevents another task from
|
|
||||||
// initializing the timeline at the same time.
|
|
||||||
unreachable!(
|
unreachable!(
|
||||||
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
|
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
|
||||||
);
|
);
|
||||||
@@ -677,9 +685,20 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
|
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
|
||||||
|
enum BrokenVerbosity {
|
||||||
|
Error,
|
||||||
|
Info
|
||||||
|
}
|
||||||
let make_broken =
|
let make_broken =
|
||||||
|t: &Tenant, err: anyhow::Error| {
|
|t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
|
||||||
error!("attach failed, setting tenant state to Broken: {err:?}");
|
match verbosity {
|
||||||
|
BrokenVerbosity::Info => {
|
||||||
|
info!("attach cancelled, setting tenant state to Broken: {err}");
|
||||||
|
},
|
||||||
|
BrokenVerbosity::Error => {
|
||||||
|
error!("attach failed, setting tenant state to Broken: {err:?}");
|
||||||
|
}
|
||||||
|
}
|
||||||
t.state.send_modify(|state| {
|
t.state.send_modify(|state| {
|
||||||
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
|
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
|
||||||
// if it errors, we will call make_broken when tenant is already in Stopping.
|
// if it errors, we will call make_broken when tenant is already in Stopping.
|
||||||
@@ -743,7 +762,7 @@ impl Tenant {
|
|||||||
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
|
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
|
||||||
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
|
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
|
||||||
// just shutting down), but ensures progress.
|
// just shutting down), but ensures progress.
|
||||||
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
|
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@@ -765,7 +784,7 @@ impl Tenant {
|
|||||||
match res {
|
match res {
|
||||||
Ok(p) => Some(p),
|
Ok(p) => Some(p),
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -789,7 +808,7 @@ impl Tenant {
|
|||||||
{
|
{
|
||||||
Ok(should_resume_deletion) => should_resume_deletion,
|
Ok(should_resume_deletion) => should_resume_deletion,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -819,7 +838,7 @@ impl Tenant {
|
|||||||
.await;
|
.await;
|
||||||
|
|
||||||
if let Err(e) = deleted {
|
if let Err(e) = deleted {
|
||||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||||
}
|
}
|
||||||
|
|
||||||
return Ok(());
|
return Ok(());
|
||||||
@@ -840,7 +859,7 @@ impl Tenant {
|
|||||||
tenant_clone.activate(broker_client, None, &ctx);
|
tenant_clone.activate(broker_client, None, &ctx);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1064,8 +1083,7 @@ impl Tenant {
|
|||||||
let entry_path = entry.path();
|
let entry_path = entry.path();
|
||||||
|
|
||||||
let purge = if crate::is_temporary(entry_path)
|
let purge = if crate::is_temporary(entry_path)
|
||||||
// TODO: uninit_mark isn't needed any more, since uninitialized timelines are already
|
// TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
|
||||||
// covered by the check that the timeline must exist in remote storage.
|
|
||||||
|| is_uninit_mark(entry_path)
|
|| is_uninit_mark(entry_path)
|
||||||
|| crate::is_delete_mark(entry_path)
|
|| crate::is_delete_mark(entry_path)
|
||||||
{
|
{
|
||||||
@@ -1298,11 +1316,6 @@ impl Tenant {
|
|||||||
/// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
|
/// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
|
||||||
/// and the timeline will fail to load at a restart.
|
/// and the timeline will fail to load at a restart.
|
||||||
///
|
///
|
||||||
/// That's why we add an uninit mark file, and wrap it together witht the Timeline
|
|
||||||
/// in-memory object into UninitializedTimeline.
|
|
||||||
/// Once the caller is done setting up the timeline, they should call
|
|
||||||
/// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark.
|
|
||||||
///
|
|
||||||
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
|
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
|
||||||
/// minimum amount of keys required to get a writable timeline.
|
/// minimum amount of keys required to get a writable timeline.
|
||||||
/// (Without it, `put` might fail due to `repartition` failing.)
|
/// (Without it, `put` might fail due to `repartition` failing.)
|
||||||
@@ -1318,7 +1331,9 @@ impl Tenant {
|
|||||||
"Cannot create empty timelines on inactive tenant"
|
"Cannot create empty timelines on inactive tenant"
|
||||||
);
|
);
|
||||||
|
|
||||||
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
|
// Protect against concurrent attempts to use this TimelineId
|
||||||
|
let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
|
||||||
|
|
||||||
let new_metadata = TimelineMetadata::new(
|
let new_metadata = TimelineMetadata::new(
|
||||||
// Initialize disk_consistent LSN to 0, The caller must import some data to
|
// Initialize disk_consistent LSN to 0, The caller must import some data to
|
||||||
// make it valid, before calling finish_creation()
|
// make it valid, before calling finish_creation()
|
||||||
@@ -1333,7 +1348,7 @@ impl Tenant {
|
|||||||
self.prepare_new_timeline(
|
self.prepare_new_timeline(
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
&new_metadata,
|
&new_metadata,
|
||||||
timeline_uninit_mark,
|
create_guard,
|
||||||
initdb_lsn,
|
initdb_lsn,
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
@@ -1396,7 +1411,7 @@ impl Tenant {
|
|||||||
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
|
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub(crate) async fn create_timeline(
|
pub(crate) async fn create_timeline(
|
||||||
&self,
|
self: &Arc<Tenant>,
|
||||||
new_timeline_id: TimelineId,
|
new_timeline_id: TimelineId,
|
||||||
ancestor_timeline_id: Option<TimelineId>,
|
ancestor_timeline_id: Option<TimelineId>,
|
||||||
mut ancestor_start_lsn: Option<Lsn>,
|
mut ancestor_start_lsn: Option<Lsn>,
|
||||||
@@ -1421,9 +1436,8 @@ impl Tenant {
|
|||||||
.map_err(|_| CreateTimelineError::ShuttingDown)?;
|
.map_err(|_| CreateTimelineError::ShuttingDown)?;
|
||||||
|
|
||||||
// Get exclusive access to the timeline ID: this ensures that it does not already exist,
|
// Get exclusive access to the timeline ID: this ensures that it does not already exist,
|
||||||
// and that no other creation attempts will be allowed in while we are working. The
|
// and that no other creation attempts will be allowed in while we are working.
|
||||||
// uninit_mark is a guard.
|
let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
|
||||||
let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
|
|
||||||
Ok(m) => m,
|
Ok(m) => m,
|
||||||
Err(TimelineExclusionError::AlreadyCreating) => {
|
Err(TimelineExclusionError::AlreadyCreating) => {
|
||||||
// Creation is in progress, we cannot create it again, and we cannot
|
// Creation is in progress, we cannot create it again, and we cannot
|
||||||
@@ -1466,6 +1480,8 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
pausable_failpoint!("timeline-creation-after-uninit");
|
||||||
|
|
||||||
let loaded_timeline = match ancestor_timeline_id {
|
let loaded_timeline = match ancestor_timeline_id {
|
||||||
Some(ancestor_timeline_id) => {
|
Some(ancestor_timeline_id) => {
|
||||||
let ancestor_timeline = self
|
let ancestor_timeline = self
|
||||||
@@ -1513,7 +1529,7 @@ impl Tenant {
|
|||||||
&ancestor_timeline,
|
&ancestor_timeline,
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
ancestor_start_lsn,
|
ancestor_start_lsn,
|
||||||
uninit_mark,
|
create_guard,
|
||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?
|
.await?
|
||||||
@@ -1523,7 +1539,7 @@ impl Tenant {
|
|||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
pg_version,
|
pg_version,
|
||||||
load_existing_initdb,
|
load_existing_initdb,
|
||||||
uninit_mark,
|
create_guard,
|
||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?
|
.await?
|
||||||
@@ -1543,7 +1559,7 @@ impl Tenant {
|
|||||||
})?;
|
})?;
|
||||||
}
|
}
|
||||||
|
|
||||||
loaded_timeline.activate(broker_client, None, ctx);
|
loaded_timeline.activate(self.clone(), broker_client, None, ctx);
|
||||||
|
|
||||||
Ok(loaded_timeline)
|
Ok(loaded_timeline)
|
||||||
}
|
}
|
||||||
@@ -1715,7 +1731,12 @@ impl Tenant {
|
|||||||
let mut activated_timelines = 0;
|
let mut activated_timelines = 0;
|
||||||
|
|
||||||
for timeline in timelines_to_activate {
|
for timeline in timelines_to_activate {
|
||||||
timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
|
timeline.activate(
|
||||||
|
self.clone(),
|
||||||
|
broker_client.clone(),
|
||||||
|
background_jobs_can_start,
|
||||||
|
ctx,
|
||||||
|
);
|
||||||
activated_timelines += 1;
|
activated_timelines += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1846,6 +1867,8 @@ impl Tenant {
|
|||||||
// Wait for any in-flight operations to complete
|
// Wait for any in-flight operations to complete
|
||||||
self.gate.close().await;
|
self.gate.close().await;
|
||||||
|
|
||||||
|
remove_tenant_metrics(&self.tenant_shard_id);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2045,7 +2068,12 @@ impl Tenant {
|
|||||||
TenantState::Active { .. } => {
|
TenantState::Active { .. } => {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
TenantState::Broken { reason, .. } => {
|
||||||
|
// This is fatal, and reported distinctly from the general case of "will never be active" because
|
||||||
|
// it's logically a 500 to external API users (broken is always a bug).
|
||||||
|
return Err(GetActiveTenantError::Broken(reason));
|
||||||
|
}
|
||||||
|
TenantState::Stopping { .. } => {
|
||||||
// There's no chance the tenant can transition back into ::Active
|
// There's no chance the tenant can transition back into ::Active
|
||||||
return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
|
return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
|
||||||
}
|
}
|
||||||
@@ -2123,7 +2151,7 @@ impl Tenant {
|
|||||||
|
|
||||||
// Shut down the timeline's remote client: this means that the indices we write
|
// Shut down the timeline's remote client: this means that the indices we write
|
||||||
// for child shards will not be invalidated by the parent shard deleting layers.
|
// for child shards will not be invalidated by the parent shard deleting layers.
|
||||||
tl_client.shutdown().await?;
|
tl_client.shutdown().await;
|
||||||
|
|
||||||
// Download methods can still be used after shutdown, as they don't flow through the remote client's
|
// Download methods can still be used after shutdown, as they don't flow through the remote client's
|
||||||
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
|
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
|
||||||
@@ -2868,9 +2896,9 @@ impl Tenant {
|
|||||||
start_lsn: Option<Lsn>,
|
start_lsn: Option<Lsn>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||||
let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
|
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
|
||||||
let tl = self
|
let tl = self
|
||||||
.branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
|
.branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
|
||||||
.await?;
|
.await?;
|
||||||
tl.set_state(TimelineState::Active);
|
tl.set_state(TimelineState::Active);
|
||||||
Ok(tl)
|
Ok(tl)
|
||||||
@@ -2884,10 +2912,10 @@ impl Tenant {
|
|||||||
src_timeline: &Arc<Timeline>,
|
src_timeline: &Arc<Timeline>,
|
||||||
dst_id: TimelineId,
|
dst_id: TimelineId,
|
||||||
start_lsn: Option<Lsn>,
|
start_lsn: Option<Lsn>,
|
||||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
|
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2896,7 +2924,7 @@ impl Tenant {
|
|||||||
src_timeline: &Arc<Timeline>,
|
src_timeline: &Arc<Timeline>,
|
||||||
dst_id: TimelineId,
|
dst_id: TimelineId,
|
||||||
start_lsn: Option<Lsn>,
|
start_lsn: Option<Lsn>,
|
||||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||||
_ctx: &RequestContext,
|
_ctx: &RequestContext,
|
||||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||||
let src_id = src_timeline.timeline_id;
|
let src_id = src_timeline.timeline_id;
|
||||||
@@ -2980,7 +3008,7 @@ impl Tenant {
|
|||||||
.prepare_new_timeline(
|
.prepare_new_timeline(
|
||||||
dst_id,
|
dst_id,
|
||||||
&metadata,
|
&metadata,
|
||||||
timeline_uninit_mark,
|
timeline_create_guard,
|
||||||
start_lsn + 1,
|
start_lsn + 1,
|
||||||
Some(Arc::clone(src_timeline)),
|
Some(Arc::clone(src_timeline)),
|
||||||
)
|
)
|
||||||
@@ -3012,12 +3040,12 @@ impl Tenant {
|
|||||||
load_existing_initdb: Option<TimelineId>,
|
load_existing_initdb: Option<TimelineId>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<Arc<Timeline>> {
|
) -> anyhow::Result<Arc<Timeline>> {
|
||||||
let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
|
let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
|
||||||
self.bootstrap_timeline(
|
self.bootstrap_timeline(
|
||||||
timeline_id,
|
timeline_id,
|
||||||
pg_version,
|
pg_version,
|
||||||
load_existing_initdb,
|
load_existing_initdb,
|
||||||
uninit_mark,
|
create_guard,
|
||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
@@ -3044,8 +3072,13 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let (pgdata_zstd, tar_zst_size) =
|
let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
|
||||||
import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
|
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
|
||||||
|
if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
|
||||||
|
warn!(
|
||||||
|
"compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
pausable_failpoint!("before-initdb-upload");
|
pausable_failpoint!("before-initdb-upload");
|
||||||
|
|
||||||
@@ -3081,7 +3114,7 @@ impl Tenant {
|
|||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
pg_version: u32,
|
pg_version: u32,
|
||||||
load_existing_initdb: Option<TimelineId>,
|
load_existing_initdb: Option<TimelineId>,
|
||||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<Arc<Timeline>> {
|
) -> anyhow::Result<Arc<Timeline>> {
|
||||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||||
@@ -3093,13 +3126,14 @@ impl Tenant {
|
|||||||
TEMP_FILE_SUFFIX,
|
TEMP_FILE_SUFFIX,
|
||||||
);
|
);
|
||||||
|
|
||||||
// an uninit mark was placed before, nothing else can access this timeline files
|
// Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
|
||||||
// current initdb was not run yet, so remove whatever was left from the previous runs
|
// we won't race with other creations or existent timelines with the same path.
|
||||||
if pgdata_path.exists() {
|
if pgdata_path.exists() {
|
||||||
fs::remove_dir_all(&pgdata_path).with_context(|| {
|
fs::remove_dir_all(&pgdata_path).with_context(|| {
|
||||||
format!("Failed to remove already existing initdb directory: {pgdata_path}")
|
format!("Failed to remove already existing initdb directory: {pgdata_path}")
|
||||||
})?;
|
})?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
|
// this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
|
||||||
scopeguard::defer! {
|
scopeguard::defer! {
|
||||||
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
|
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
|
||||||
@@ -3144,7 +3178,7 @@ impl Tenant {
|
|||||||
|
|
||||||
let buf_read =
|
let buf_read =
|
||||||
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
|
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
|
||||||
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
|
extract_zst_tarball(&pgdata_path, buf_read)
|
||||||
.await
|
.await
|
||||||
.context("extract initdb tar")?;
|
.context("extract initdb tar")?;
|
||||||
} else {
|
} else {
|
||||||
@@ -3176,7 +3210,7 @@ impl Tenant {
|
|||||||
.prepare_new_timeline(
|
.prepare_new_timeline(
|
||||||
timeline_id,
|
timeline_id,
|
||||||
&new_metadata,
|
&new_metadata,
|
||||||
timeline_uninit_mark,
|
timeline_create_guard,
|
||||||
pgdata_lsn,
|
pgdata_lsn,
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
@@ -3248,13 +3282,12 @@ impl Tenant {
|
|||||||
///
|
///
|
||||||
/// An empty layer map is initialized, and new data and WAL can be imported starting
|
/// An empty layer map is initialized, and new data and WAL can be imported starting
|
||||||
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
|
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
|
||||||
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
|
/// `finish_creation` to insert the Timeline into the timelines map.
|
||||||
/// uninit mark file.
|
|
||||||
async fn prepare_new_timeline<'a>(
|
async fn prepare_new_timeline<'a>(
|
||||||
&'a self,
|
&'a self,
|
||||||
new_timeline_id: TimelineId,
|
new_timeline_id: TimelineId,
|
||||||
new_metadata: &TimelineMetadata,
|
new_metadata: &TimelineMetadata,
|
||||||
uninit_mark: TimelineUninitMark<'a>,
|
create_guard: TimelineCreateGuard<'a>,
|
||||||
start_lsn: Lsn,
|
start_lsn: Lsn,
|
||||||
ancestor: Option<Arc<Timeline>>,
|
ancestor: Option<Arc<Timeline>>,
|
||||||
) -> anyhow::Result<UninitializedTimeline> {
|
) -> anyhow::Result<UninitializedTimeline> {
|
||||||
@@ -3277,9 +3310,12 @@ impl Tenant {
|
|||||||
|
|
||||||
timeline_struct.init_empty_layer_map(start_lsn);
|
timeline_struct.init_empty_layer_map(start_lsn);
|
||||||
|
|
||||||
if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
|
if let Err(e) = self
|
||||||
|
.create_timeline_files(&create_guard.timeline_path)
|
||||||
|
.await
|
||||||
|
{
|
||||||
error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
|
error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
|
||||||
cleanup_timeline_directory(uninit_mark);
|
cleanup_timeline_directory(create_guard);
|
||||||
return Err(e);
|
return Err(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3290,41 +3326,31 @@ impl Tenant {
|
|||||||
Ok(UninitializedTimeline::new(
|
Ok(UninitializedTimeline::new(
|
||||||
self,
|
self,
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
Some((timeline_struct, uninit_mark)),
|
Some((timeline_struct, create_guard)),
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
|
async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
|
||||||
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
|
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
|
||||||
|
|
||||||
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
|
fail::fail_point!("after-timeline-dir-creation", |_| {
|
||||||
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
|
anyhow::bail!("failpoint after-timeline-dir-creation");
|
||||||
});
|
});
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Attempts to create an uninit mark file for the timeline initialization.
|
/// Get a guard that provides exclusive access to the timeline directory, preventing
|
||||||
/// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists.
|
/// concurrent attempts to create the same timeline.
|
||||||
///
|
fn create_timeline_create_guard(
|
||||||
/// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init.
|
|
||||||
fn create_timeline_uninit_mark(
|
|
||||||
&self,
|
&self,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> Result<TimelineUninitMark, TimelineExclusionError> {
|
) -> Result<TimelineCreateGuard, TimelineExclusionError> {
|
||||||
let tenant_shard_id = self.tenant_shard_id;
|
let tenant_shard_id = self.tenant_shard_id;
|
||||||
|
|
||||||
let uninit_mark_path = self
|
|
||||||
.conf
|
|
||||||
.timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
|
|
||||||
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
|
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||||
|
|
||||||
let uninit_mark = TimelineUninitMark::new(
|
let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
|
||||||
self,
|
|
||||||
timeline_id,
|
|
||||||
uninit_mark_path.clone(),
|
|
||||||
timeline_path.clone(),
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// At this stage, we have got exclusive access to in-memory state for this timeline ID
|
// At this stage, we have got exclusive access to in-memory state for this timeline ID
|
||||||
// for creation.
|
// for creation.
|
||||||
@@ -3340,23 +3366,7 @@ impl Tenant {
|
|||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees
|
Ok(create_guard)
|
||||||
// that during process runtime, colliding creations will be caught in-memory without getting
|
|
||||||
// as far as failing to write a file.
|
|
||||||
fs::OpenOptions::new()
|
|
||||||
.write(true)
|
|
||||||
.create_new(true)
|
|
||||||
.open(&uninit_mark_path)
|
|
||||||
.context("Failed to create uninit mark file")
|
|
||||||
.and_then(|_| {
|
|
||||||
crashsafe::fsync_file_and_parent(&uninit_mark_path)
|
|
||||||
.context("Failed to fsync uninit mark file")
|
|
||||||
})
|
|
||||||
.with_context(|| {
|
|
||||||
format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok(uninit_mark)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Gathers inputs from all of the timelines to produce a sizing model input.
|
/// Gathers inputs from all of the timelines to produce a sizing model input.
|
||||||
@@ -3557,11 +3567,6 @@ async fn run_initdb(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for Tenant {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
remove_tenant_metrics(&self.tenant_shard_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/// Dump contents of a layer file to stdout.
|
/// Dump contents of a layer file to stdout.
|
||||||
pub async fn dump_layerfile_from_path(
|
pub async fn dump_layerfile_from_path(
|
||||||
path: &Utf8Path,
|
path: &Utf8Path,
|
||||||
@@ -4628,10 +4633,7 @@ mod tests {
|
|||||||
drop(guard);
|
drop(guard);
|
||||||
|
|
||||||
// Pick a big LSN such that we query over all the changes.
|
// Pick a big LSN such that we query over all the changes.
|
||||||
// Technically, u64::MAX - 1 is the largest LSN supported by the read path,
|
let reads_lsn = Lsn(u64::MAX - 1);
|
||||||
// but there seems to be a bug on the non-vectored search path which surfaces
|
|
||||||
// in that case.
|
|
||||||
let reads_lsn = Lsn(u64::MAX - 1000);
|
|
||||||
|
|
||||||
for read in reads {
|
for read in reads {
|
||||||
info!("Doing vectored read on {:?}", read);
|
info!("Doing vectored read on {:?}", read);
|
||||||
@@ -5105,15 +5107,15 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_uninit_mark_crash() -> anyhow::Result<()> {
|
async fn test_create_guard_crash() -> anyhow::Result<()> {
|
||||||
let name = "test_uninit_mark_crash";
|
let name = "test_create_guard_crash";
|
||||||
let harness = TenantHarness::create(name)?;
|
let harness = TenantHarness::create(name)?;
|
||||||
{
|
{
|
||||||
let (tenant, ctx) = harness.load().await;
|
let (tenant, ctx) = harness.load().await;
|
||||||
let tline = tenant
|
let tline = tenant
|
||||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||||
.await?;
|
.await?;
|
||||||
// Keeps uninit mark in place
|
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
|
||||||
let raw_tline = tline.raw_timeline().unwrap();
|
let raw_tline = tline.raw_timeline().unwrap();
|
||||||
raw_tline
|
raw_tline
|
||||||
.shutdown()
|
.shutdown()
|
||||||
@@ -5141,10 +5143,24 @@ mod tests {
|
|||||||
.timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID)
|
.timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID)
|
||||||
.exists());
|
.exists());
|
||||||
|
|
||||||
assert!(!harness
|
Ok(())
|
||||||
.conf
|
}
|
||||||
.timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID)
|
|
||||||
.exists());
|
#[tokio::test]
|
||||||
|
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
|
||||||
|
let harness = TenantHarness::create("test_read_at_max_lsn")?;
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
let tline = tenant
|
||||||
|
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let lsn = Lsn(0x10);
|
||||||
|
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
|
||||||
|
|
||||||
|
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||||
|
let read_lsn = Lsn(u64::MAX - 1);
|
||||||
|
|
||||||
|
assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -196,16 +196,17 @@ impl LocationConf {
|
|||||||
/// For use when attaching/re-attaching: update the generation stored in this
|
/// For use when attaching/re-attaching: update the generation stored in this
|
||||||
/// structure. If we were in a secondary state, promote to attached (posession
|
/// structure. If we were in a secondary state, promote to attached (posession
|
||||||
/// of a fresh generation implies this).
|
/// of a fresh generation implies this).
|
||||||
pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
|
pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
|
||||||
match &mut self.mode {
|
match &mut self.mode {
|
||||||
LocationMode::Attached(attach_conf) => {
|
LocationMode::Attached(attach_conf) => {
|
||||||
attach_conf.generation = generation;
|
attach_conf.generation = generation;
|
||||||
|
attach_conf.attach_mode = mode;
|
||||||
}
|
}
|
||||||
LocationMode::Secondary(_) => {
|
LocationMode::Secondary(_) => {
|
||||||
// We are promoted to attached by the control plane's re-attach response
|
// We are promoted to attached by the control plane's re-attach response
|
||||||
self.mode = LocationMode::Attached(AttachedLocationConfig {
|
self.mode = LocationMode::Attached(AttachedLocationConfig {
|
||||||
generation,
|
generation,
|
||||||
attach_mode: AttachmentMode::Single,
|
attach_mode: mode,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -354,6 +355,7 @@ pub struct TenantConf {
|
|||||||
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
||||||
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
||||||
/// locations will use the heatmap uploaded by attached locations.
|
/// locations will use the heatmap uploaded by attached locations.
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
pub heatmap_period: Duration,
|
pub heatmap_period: Duration,
|
||||||
|
|
||||||
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
|
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
|
||||||
|
|||||||
@@ -111,6 +111,7 @@ async fn create_local_delete_mark(
|
|||||||
let _ = std::fs::OpenOptions::new()
|
let _ = std::fs::OpenOptions::new()
|
||||||
.write(true)
|
.write(true)
|
||||||
.create(true)
|
.create(true)
|
||||||
|
.truncate(true)
|
||||||
.open(&marker_path)
|
.open(&marker_path)
|
||||||
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||||
|
|
||||||
@@ -296,6 +297,7 @@ impl DeleteTenantFlow {
|
|||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||||
tenant: Arc<Tenant>,
|
tenant: Arc<Tenant>,
|
||||||
|
cancel: &CancellationToken,
|
||||||
) -> Result<(), DeleteTenantError> {
|
) -> Result<(), DeleteTenantError> {
|
||||||
span::debug_assert_current_span_has_tenant_id();
|
span::debug_assert_current_span_has_tenant_id();
|
||||||
|
|
||||||
@@ -303,7 +305,9 @@ impl DeleteTenantFlow {
|
|||||||
|
|
||||||
let mut guard = Self::prepare(&tenant).await?;
|
let mut guard = Self::prepare(&tenant).await?;
|
||||||
|
|
||||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
if let Err(e) =
|
||||||
|
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
|
||||||
|
{
|
||||||
tenant.set_broken(format!("{e:#}")).await;
|
tenant.set_broken(format!("{e:#}")).await;
|
||||||
return Err(e);
|
return Err(e);
|
||||||
}
|
}
|
||||||
@@ -322,6 +326,7 @@ impl DeleteTenantFlow {
|
|||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
remote_storage: Option<&GenericRemoteStorage>,
|
remote_storage: Option<&GenericRemoteStorage>,
|
||||||
tenant: &Tenant,
|
tenant: &Tenant,
|
||||||
|
cancel: &CancellationToken,
|
||||||
) -> Result<(), DeleteTenantError> {
|
) -> Result<(), DeleteTenantError> {
|
||||||
guard.mark_in_progress()?;
|
guard.mark_in_progress()?;
|
||||||
|
|
||||||
@@ -335,15 +340,9 @@ impl DeleteTenantFlow {
|
|||||||
// Though sounds scary, different mark name?
|
// Though sounds scary, different mark name?
|
||||||
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
||||||
if let Some(remote_storage) = &remote_storage {
|
if let Some(remote_storage) = &remote_storage {
|
||||||
create_remote_delete_mark(
|
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
|
||||||
conf,
|
.await
|
||||||
remote_storage,
|
.context("remote_mark")?
|
||||||
&tenant.tenant_shard_id,
|
|
||||||
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
|
|
||||||
&CancellationToken::new(),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.context("remote_mark")?
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||||
@@ -546,8 +545,7 @@ impl DeleteTenantFlow {
|
|||||||
conf,
|
conf,
|
||||||
remote_storage.as_ref(),
|
remote_storage.as_ref(),
|
||||||
&tenant.tenant_shard_id,
|
&tenant.tenant_shard_id,
|
||||||
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
|
&task_mgr::shutdown_token(),
|
||||||
&CancellationToken::new(),
|
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -217,7 +217,7 @@ use crate::task_mgr::shutdown_token;
|
|||||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||||
use crate::tenant::remote_timeline_client::download::download_retry;
|
use crate::tenant::remote_timeline_client::download::download_retry;
|
||||||
use crate::tenant::storage_layer::AsLayerDesc;
|
use crate::tenant::storage_layer::AsLayerDesc;
|
||||||
use crate::tenant::upload_queue::Delete;
|
use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
|
||||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||||
use crate::{
|
use crate::{
|
||||||
config::PageServerConf,
|
config::PageServerConf,
|
||||||
@@ -266,15 +266,6 @@ pub enum MaybeDeletedIndexPart {
|
|||||||
Deleted(IndexPart),
|
Deleted(IndexPart),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
|
||||||
pub enum StopError {
|
|
||||||
/// Returned if the upload queue was never initialized.
|
|
||||||
/// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
|
|
||||||
#[error("queue is not initialized")]
|
|
||||||
QueueUninitialized,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
pub enum PersistIndexPartWithDeletedFlagError {
|
pub enum PersistIndexPartWithDeletedFlagError {
|
||||||
#[error("another task is already setting the deleted_flag, started at {0:?}")]
|
#[error("another task is already setting the deleted_flag, started at {0:?}")]
|
||||||
@@ -399,15 +390,10 @@ impl RemoteTimelineClient {
|
|||||||
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
|
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
|
||||||
))?;
|
))?;
|
||||||
|
|
||||||
{
|
|
||||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
|
||||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
|
||||||
self.update_remote_physical_size_gauge(Some(index_part));
|
|
||||||
}
|
|
||||||
// also locks upload queue, without dropping the guard above it will be a deadlock
|
|
||||||
self.stop().expect("initialized line above");
|
|
||||||
|
|
||||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||||
|
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||||
|
self.update_remote_physical_size_gauge(Some(index_part));
|
||||||
|
self.stop_impl(&mut upload_queue);
|
||||||
|
|
||||||
upload_queue
|
upload_queue
|
||||||
.stopped_mut()
|
.stopped_mut()
|
||||||
@@ -421,7 +407,8 @@ impl RemoteTimelineClient {
|
|||||||
match &mut *self.upload_queue.lock().unwrap() {
|
match &mut *self.upload_queue.lock().unwrap() {
|
||||||
UploadQueue::Uninitialized => None,
|
UploadQueue::Uninitialized => None,
|
||||||
UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
|
UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
|
||||||
UploadQueue::Stopped(q) => q
|
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
|
||||||
|
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q
|
||||||
.upload_queue_for_deletion
|
.upload_queue_for_deletion
|
||||||
.get_last_remote_consistent_lsn_projected(),
|
.get_last_remote_consistent_lsn_projected(),
|
||||||
}
|
}
|
||||||
@@ -431,7 +418,8 @@ impl RemoteTimelineClient {
|
|||||||
match &mut *self.upload_queue.lock().unwrap() {
|
match &mut *self.upload_queue.lock().unwrap() {
|
||||||
UploadQueue::Uninitialized => None,
|
UploadQueue::Uninitialized => None,
|
||||||
UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
|
UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
|
||||||
UploadQueue::Stopped(q) => Some(
|
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
|
||||||
|
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some(
|
||||||
q.upload_queue_for_deletion
|
q.upload_queue_for_deletion
|
||||||
.get_last_remote_consistent_lsn_visible(),
|
.get_last_remote_consistent_lsn_visible(),
|
||||||
),
|
),
|
||||||
@@ -898,7 +886,7 @@ impl RemoteTimelineClient {
|
|||||||
/// Wait for all previously scheduled operations to complete, and then stop.
|
/// Wait for all previously scheduled operations to complete, and then stop.
|
||||||
///
|
///
|
||||||
/// Not cancellation safe
|
/// Not cancellation safe
|
||||||
pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
|
pub(crate) async fn shutdown(self: &Arc<Self>) {
|
||||||
// On cancellation the queue is left in ackward state of refusing new operations but
|
// On cancellation the queue is left in ackward state of refusing new operations but
|
||||||
// proper stop is yet to be called. On cancel the original or some later task must call
|
// proper stop is yet to be called. On cancel the original or some later task must call
|
||||||
// `stop` or `shutdown`.
|
// `stop` or `shutdown`.
|
||||||
@@ -909,8 +897,12 @@ impl RemoteTimelineClient {
|
|||||||
let fut = {
|
let fut = {
|
||||||
let mut guard = self.upload_queue.lock().unwrap();
|
let mut guard = self.upload_queue.lock().unwrap();
|
||||||
let upload_queue = match &mut *guard {
|
let upload_queue = match &mut *guard {
|
||||||
UploadQueue::Stopped(_) => return Ok(()),
|
UploadQueue::Stopped(_) => return,
|
||||||
UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
|
UploadQueue::Uninitialized => {
|
||||||
|
// transition into Stopped state
|
||||||
|
self.stop_impl(&mut guard);
|
||||||
|
return;
|
||||||
|
}
|
||||||
UploadQueue::Initialized(ref mut init) => init,
|
UploadQueue::Initialized(ref mut init) => init,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -942,7 +934,7 @@ impl RemoteTimelineClient {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
self.stop()
|
self.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Set the deleted_at field in the remote index file.
|
/// Set the deleted_at field in the remote index file.
|
||||||
@@ -1324,12 +1316,7 @@ impl RemoteTimelineClient {
|
|||||||
// upload finishes or times out soon enough.
|
// upload finishes or times out soon enough.
|
||||||
if cancel.is_cancelled() {
|
if cancel.is_cancelled() {
|
||||||
info!("upload task cancelled by shutdown request");
|
info!("upload task cancelled by shutdown request");
|
||||||
match self.stop() {
|
self.stop();
|
||||||
Ok(()) => {}
|
|
||||||
Err(StopError::QueueUninitialized) => {
|
|
||||||
unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1584,17 +1571,23 @@ impl RemoteTimelineClient {
|
|||||||
/// In-progress operations will still be running after this function returns.
|
/// In-progress operations will still be running after this function returns.
|
||||||
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
||||||
/// to wait for them to complete, after calling this function.
|
/// to wait for them to complete, after calling this function.
|
||||||
pub(crate) fn stop(&self) -> Result<(), StopError> {
|
pub(crate) fn stop(&self) {
|
||||||
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
||||||
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
|
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
|
||||||
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
|
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
|
||||||
let mut guard = self.upload_queue.lock().unwrap();
|
let mut guard = self.upload_queue.lock().unwrap();
|
||||||
match &mut *guard {
|
self.stop_impl(&mut guard);
|
||||||
UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
|
}
|
||||||
|
|
||||||
|
fn stop_impl(&self, guard: &mut std::sync::MutexGuard<UploadQueue>) {
|
||||||
|
match &mut **guard {
|
||||||
|
UploadQueue::Uninitialized => {
|
||||||
|
info!("UploadQueue is in state Uninitialized, nothing to do");
|
||||||
|
**guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized);
|
||||||
|
}
|
||||||
UploadQueue::Stopped(_) => {
|
UploadQueue::Stopped(_) => {
|
||||||
// nothing to do
|
// nothing to do
|
||||||
info!("another concurrent task already shut down the queue");
|
info!("another concurrent task already shut down the queue");
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
UploadQueue::Initialized(initialized) => {
|
UploadQueue::Initialized(initialized) => {
|
||||||
info!("shutting down upload queue");
|
info!("shutting down upload queue");
|
||||||
@@ -1627,11 +1620,13 @@ impl RemoteTimelineClient {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let upload_queue = std::mem::replace(
|
let upload_queue = std::mem::replace(
|
||||||
&mut *guard,
|
&mut **guard,
|
||||||
UploadQueue::Stopped(UploadQueueStopped {
|
UploadQueue::Stopped(UploadQueueStopped::Deletable(
|
||||||
upload_queue_for_deletion,
|
UploadQueueStoppedDeletable {
|
||||||
deleted_at: SetDeletedFlagProgress::NotRunning,
|
upload_queue_for_deletion,
|
||||||
}),
|
deleted_at: SetDeletedFlagProgress::NotRunning,
|
||||||
|
},
|
||||||
|
)),
|
||||||
);
|
);
|
||||||
if let UploadQueue::Initialized(qi) = upload_queue {
|
if let UploadQueue::Initialized(qi) = upload_queue {
|
||||||
qi
|
qi
|
||||||
@@ -1660,10 +1655,6 @@ impl RemoteTimelineClient {
|
|||||||
// which is exactly what we want to happen.
|
// which is exactly what we want to happen.
|
||||||
drop(op);
|
drop(op);
|
||||||
}
|
}
|
||||||
|
|
||||||
// We're done.
|
|
||||||
drop(guard);
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ use crate::tenant::storage_layer::LayerFileName;
|
|||||||
use crate::tenant::Generation;
|
use crate::tenant::Generation;
|
||||||
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
|
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
|
||||||
use crate::TEMP_FILE_SUFFIX;
|
use crate::TEMP_FILE_SUFFIX;
|
||||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
|
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
|
||||||
use utils::crashsafe::path_with_suffix_extension;
|
use utils::crashsafe::path_with_suffix_extension;
|
||||||
use utils::id::TimelineId;
|
use utils::id::TimelineId;
|
||||||
|
|
||||||
@@ -73,55 +73,13 @@ pub async fn download_layer_file<'a>(
|
|||||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||||
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||||
|
|
||||||
let (mut destination_file, bytes_amount) = download_retry(
|
let bytes_amount = download_retry(
|
||||||
|| async {
|
|| async { download_object(storage, &remote_path, &temp_file_path, cancel).await },
|
||||||
let destination_file = tokio::fs::File::create(&temp_file_path)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
let download = storage.download(&remote_path, cancel).await?;
|
|
||||||
|
|
||||||
let mut destination_file =
|
|
||||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
|
||||||
|
|
||||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
|
||||||
|
|
||||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;
|
|
||||||
|
|
||||||
match bytes_amount {
|
|
||||||
Ok(bytes_amount) => {
|
|
||||||
let destination_file = destination_file.into_inner();
|
|
||||||
Ok((destination_file, bytes_amount))
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
|
|
||||||
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
|
|
||||||
}
|
|
||||||
|
|
||||||
Err(e.into())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
&format!("download {remote_path:?}"),
|
&format!("download {remote_path:?}"),
|
||||||
cancel,
|
cancel,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
|
||||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
|
||||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
|
||||||
// you should call flush before dropping it.
|
|
||||||
//
|
|
||||||
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
|
|
||||||
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
|
|
||||||
// But for additional safety lets check/wait for any pending operations.
|
|
||||||
destination_file
|
|
||||||
.flush()
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("flush source file at {temp_file_path}"))
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
let expected = layer_metadata.file_size();
|
let expected = layer_metadata.file_size();
|
||||||
if expected != bytes_amount {
|
if expected != bytes_amount {
|
||||||
return Err(DownloadError::Other(anyhow!(
|
return Err(DownloadError::Other(anyhow!(
|
||||||
@@ -129,14 +87,6 @@ pub async fn download_layer_file<'a>(
|
|||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
// not using sync_data because it can lose file size update
|
|
||||||
destination_file
|
|
||||||
.sync_all()
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("failed to fsync source file at {temp_file_path}"))
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
drop(destination_file);
|
|
||||||
|
|
||||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||||
Err(DownloadError::Other(anyhow!(
|
Err(DownloadError::Other(anyhow!(
|
||||||
"remote-storage-download-pre-rename failpoint triggered"
|
"remote-storage-download-pre-rename failpoint triggered"
|
||||||
@@ -169,6 +119,128 @@ pub async fn download_layer_file<'a>(
|
|||||||
Ok(bytes_amount)
|
Ok(bytes_amount)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Download the object `src_path` in the remote `storage` to local path `dst_path`.
|
||||||
|
///
|
||||||
|
/// If Ok() is returned, the download succeeded and the inode & data have been made durable.
|
||||||
|
/// (Note that the directory entry for the inode is not made durable.)
|
||||||
|
/// The file size in bytes is returned.
|
||||||
|
///
|
||||||
|
/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked.
|
||||||
|
/// The unlinking has _not_ been made durable.
|
||||||
|
async fn download_object<'a>(
|
||||||
|
storage: &'a GenericRemoteStorage,
|
||||||
|
src_path: &RemotePath,
|
||||||
|
dst_path: &Utf8PathBuf,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
) -> Result<u64, DownloadError> {
|
||||||
|
let res = match crate::virtual_file::io_engine::get() {
|
||||||
|
crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
|
||||||
|
crate::virtual_file::io_engine::IoEngine::StdFs => {
|
||||||
|
async {
|
||||||
|
let destination_file = tokio::fs::File::create(dst_path)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
let download = storage.download(src_path, cancel).await?;
|
||||||
|
|
||||||
|
let mut buf_writer =
|
||||||
|
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||||
|
|
||||||
|
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||||
|
|
||||||
|
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?;
|
||||||
|
buf_writer.flush().await?;
|
||||||
|
|
||||||
|
let mut destination_file = buf_writer.into_inner();
|
||||||
|
|
||||||
|
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||||
|
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||||
|
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||||
|
// you should call flush before dropping it.
|
||||||
|
//
|
||||||
|
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
|
||||||
|
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
|
||||||
|
// But for additional safety lets check/wait for any pending operations.
|
||||||
|
destination_file
|
||||||
|
.flush()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("flush source file at {dst_path}"))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
// not using sync_data because it can lose file size update
|
||||||
|
destination_file
|
||||||
|
.sync_all()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
Ok(bytes_amount)
|
||||||
|
}
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
|
||||||
|
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
|
||||||
|
async {
|
||||||
|
let destination_file = VirtualFile::create(dst_path)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
let mut download = storage.download(src_path, cancel).await?;
|
||||||
|
|
||||||
|
// TODO: use vectored write (writev) once supported by tokio-epoll-uring.
|
||||||
|
// There's chunks_vectored() on the stream.
|
||||||
|
let (bytes_amount, destination_file) = async {
|
||||||
|
let size_tracking = size_tracking_writer::Writer::new(destination_file);
|
||||||
|
let mut buffered = owned_buffers_io::write::BufferedWriter::<
|
||||||
|
{ super::BUFFER_SIZE },
|
||||||
|
_,
|
||||||
|
>::new(size_tracking);
|
||||||
|
while let Some(res) =
|
||||||
|
futures::StreamExt::next(&mut download.download_stream).await
|
||||||
|
{
|
||||||
|
let chunk = match res {
|
||||||
|
Ok(chunk) => chunk,
|
||||||
|
Err(e) => return Err(e),
|
||||||
|
};
|
||||||
|
buffered
|
||||||
|
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk))
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
let size_tracking = buffered.flush_and_into_inner().await?;
|
||||||
|
Ok(size_tracking.into_inner())
|
||||||
|
}
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// not using sync_data because it can lose file size update
|
||||||
|
destination_file
|
||||||
|
.sync_all()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
Ok(bytes_amount)
|
||||||
|
}
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// in case the download failed, clean up
|
||||||
|
match res {
|
||||||
|
Ok(bytes_amount) => Ok(bytes_amount),
|
||||||
|
Err(e) => {
|
||||||
|
if let Err(e) = tokio::fs::remove_file(dst_path).await {
|
||||||
|
if e.kind() != std::io::ErrorKind::NotFound {
|
||||||
|
on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||||
|
|
||||||
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
|
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
|
||||||
|
|||||||
@@ -95,7 +95,11 @@ pub(crate) struct SecondaryTenant {
|
|||||||
shard_identity: ShardIdentity,
|
shard_identity: ShardIdentity,
|
||||||
tenant_conf: std::sync::Mutex<TenantConfOpt>,
|
tenant_conf: std::sync::Mutex<TenantConfOpt>,
|
||||||
|
|
||||||
|
// Internal state used by the Downloader.
|
||||||
detail: std::sync::Mutex<SecondaryDetail>,
|
detail: std::sync::Mutex<SecondaryDetail>,
|
||||||
|
|
||||||
|
// Public state indicating overall progress of downloads relative to the last heatmap seen
|
||||||
|
pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SecondaryTenant {
|
impl SecondaryTenant {
|
||||||
@@ -118,6 +122,8 @@ impl SecondaryTenant {
|
|||||||
tenant_conf: std::sync::Mutex::new(tenant_conf),
|
tenant_conf: std::sync::Mutex::new(tenant_conf),
|
||||||
|
|
||||||
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
|
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
|
||||||
|
|
||||||
|
progress: std::sync::Mutex::default(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -247,9 +253,12 @@ impl SecondaryTenant {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
|
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
|
||||||
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
|
/// and heatmap uploads. This is not a hot data path: it's used for:
|
||||||
/// where we want to immediately upload/download for a particular tenant. In normal operation
|
/// - Live migrations, where we want to ensure a migration destination has the freshest possible
|
||||||
/// uploads & downloads are autonomous and not driven by this interface.
|
/// content before trying to cut over.
|
||||||
|
/// - Tests, where we want to immediately upload/download for a particular tenant.
|
||||||
|
///
|
||||||
|
/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface.
|
||||||
pub struct SecondaryController {
|
pub struct SecondaryController {
|
||||||
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
|
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
|
||||||
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
|
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ use crate::{
|
|||||||
tenant::{
|
tenant::{
|
||||||
config::SecondaryLocationConfig,
|
config::SecondaryLocationConfig,
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||||
|
ephemeral_file::is_ephemeral_file,
|
||||||
remote_timeline_client::{
|
remote_timeline_client::{
|
||||||
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||||
FAILED_REMOTE_OP_RETRIES,
|
FAILED_REMOTE_OP_RETRIES,
|
||||||
@@ -41,14 +42,16 @@ use crate::tenant::{
|
|||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||||
use futures::Future;
|
use futures::Future;
|
||||||
|
use pageserver_api::models::SecondaryProgress;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
|
||||||
|
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{info_span, instrument, warn, Instrument};
|
use tracing::{info_span, instrument, warn, Instrument};
|
||||||
use utils::{
|
use utils::{
|
||||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
|
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
|
||||||
|
id::TimelineId,
|
||||||
};
|
};
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
@@ -128,6 +131,7 @@ pub(super) struct SecondaryDetail {
|
|||||||
pub(super) config: SecondaryLocationConfig,
|
pub(super) config: SecondaryLocationConfig,
|
||||||
|
|
||||||
last_download: Option<Instant>,
|
last_download: Option<Instant>,
|
||||||
|
last_etag: Option<Etag>,
|
||||||
next_download: Option<Instant>,
|
next_download: Option<Instant>,
|
||||||
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||||
}
|
}
|
||||||
@@ -138,11 +142,26 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
|
|||||||
datetime.format("%d/%m/%Y %T")
|
datetime.format("%d/%m/%Y %T")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Information returned from download function when it detects the heatmap has changed
|
||||||
|
struct HeatMapModified {
|
||||||
|
etag: Etag,
|
||||||
|
last_modified: SystemTime,
|
||||||
|
bytes: Vec<u8>,
|
||||||
|
}
|
||||||
|
|
||||||
|
enum HeatMapDownload {
|
||||||
|
// The heatmap's etag has changed: return the new etag, mtime and the body bytes
|
||||||
|
Modified(HeatMapModified),
|
||||||
|
// The heatmap's etag is unchanged
|
||||||
|
Unmodified,
|
||||||
|
}
|
||||||
|
|
||||||
impl SecondaryDetail {
|
impl SecondaryDetail {
|
||||||
pub(super) fn new(config: SecondaryLocationConfig) -> Self {
|
pub(super) fn new(config: SecondaryLocationConfig) -> Self {
|
||||||
Self {
|
Self {
|
||||||
config,
|
config,
|
||||||
last_download: None,
|
last_download: None,
|
||||||
|
last_etag: None,
|
||||||
next_download: None,
|
next_download: None,
|
||||||
timelines: HashMap::new(),
|
timelines: HashMap::new(),
|
||||||
}
|
}
|
||||||
@@ -477,11 +496,31 @@ impl<'a> TenantDownloader<'a> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||||
|
|
||||||
|
// We will use the etag from last successful download to make the download conditional on changes
|
||||||
|
let last_etag = self
|
||||||
|
.secondary_state
|
||||||
|
.detail
|
||||||
|
.lock()
|
||||||
|
.unwrap()
|
||||||
|
.last_etag
|
||||||
|
.clone();
|
||||||
|
|
||||||
// Download the tenant's heatmap
|
// Download the tenant's heatmap
|
||||||
let heatmap_bytes = tokio::select!(
|
let HeatMapModified {
|
||||||
bytes = self.download_heatmap() => {bytes?},
|
last_modified: heatmap_mtime,
|
||||||
|
etag: heatmap_etag,
|
||||||
|
bytes: heatmap_bytes,
|
||||||
|
} = match tokio::select!(
|
||||||
|
bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
|
||||||
_ = self.secondary_state.cancel.cancelled() => return Ok(())
|
_ = self.secondary_state.cancel.cancelled() => return Ok(())
|
||||||
);
|
) {
|
||||||
|
HeatMapDownload::Unmodified => {
|
||||||
|
tracing::info!("Heatmap unchanged since last successful download");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
HeatMapDownload::Modified(m) => m,
|
||||||
|
};
|
||||||
|
|
||||||
let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
|
let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
|
||||||
|
|
||||||
@@ -496,11 +535,27 @@ impl<'a> TenantDownloader<'a> {
|
|||||||
.await
|
.await
|
||||||
.maybe_fatal_err(&context_msg)?;
|
.maybe_fatal_err(&context_msg)?;
|
||||||
|
|
||||||
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
|
tracing::debug!(
|
||||||
|
"Wrote local heatmap to {}, with {} timelines",
|
||||||
|
heatmap_path,
|
||||||
|
heatmap.timelines.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
|
||||||
|
// principle that deletions should be done before writes wherever possible, and so that we can use this
|
||||||
|
// phase to initialize our SecondaryProgress.
|
||||||
|
{
|
||||||
|
*self.secondary_state.progress.lock().unwrap() =
|
||||||
|
self.prepare_timelines(&heatmap, heatmap_mtime).await?;
|
||||||
|
}
|
||||||
|
|
||||||
// Download the layers in the heatmap
|
// Download the layers in the heatmap
|
||||||
for timeline in heatmap.timelines {
|
for timeline in heatmap.timelines {
|
||||||
if self.secondary_state.cancel.is_cancelled() {
|
if self.secondary_state.cancel.is_cancelled() {
|
||||||
|
tracing::debug!(
|
||||||
|
"Cancelled before downloading timeline {}",
|
||||||
|
timeline.timeline_id
|
||||||
|
);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -515,30 +570,159 @@ impl<'a> TenantDownloader<'a> {
|
|||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Only update last_etag after a full successful download: this way will not skip
|
||||||
|
// the next download, even if the heatmap's actual etag is unchanged.
|
||||||
|
self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
|
/// Do any fast local cleanup that comes before the much slower process of downloading
|
||||||
|
/// layers from remote storage. In the process, initialize the SecondaryProgress object
|
||||||
|
/// that will later be updated incrementally as we download layers.
|
||||||
|
async fn prepare_timelines(
|
||||||
|
&self,
|
||||||
|
heatmap: &HeatMapTenant,
|
||||||
|
heatmap_mtime: SystemTime,
|
||||||
|
) -> Result<SecondaryProgress, UpdateError> {
|
||||||
|
let heatmap_stats = heatmap.get_stats();
|
||||||
|
// We will construct a progress object, and then populate its initial "downloaded" numbers
|
||||||
|
// while iterating through local layer state in [`Self::prepare_timelines`]
|
||||||
|
let mut progress = SecondaryProgress {
|
||||||
|
layers_total: heatmap_stats.layers,
|
||||||
|
bytes_total: heatmap_stats.bytes,
|
||||||
|
heatmap_mtime: Some(heatmap_mtime),
|
||||||
|
layers_downloaded: 0,
|
||||||
|
bytes_downloaded: 0,
|
||||||
|
};
|
||||||
|
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
|
||||||
|
let mut delete_layers = Vec::new();
|
||||||
|
let mut delete_timelines = Vec::new();
|
||||||
|
{
|
||||||
|
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||||
|
for (timeline_id, timeline_state) in &mut detail.timelines {
|
||||||
|
let Some(heatmap_timeline_index) = heatmap
|
||||||
|
.timelines
|
||||||
|
.iter()
|
||||||
|
.position(|t| t.timeline_id == *timeline_id)
|
||||||
|
else {
|
||||||
|
// This timeline is no longer referenced in the heatmap: delete it locally
|
||||||
|
delete_timelines.push(*timeline_id);
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
|
||||||
|
|
||||||
|
let layers_in_heatmap = heatmap_timeline
|
||||||
|
.layers
|
||||||
|
.iter()
|
||||||
|
.map(|l| &l.name)
|
||||||
|
.collect::<HashSet<_>>();
|
||||||
|
let layers_on_disk = timeline_state
|
||||||
|
.on_disk_layers
|
||||||
|
.iter()
|
||||||
|
.map(|l| l.0)
|
||||||
|
.collect::<HashSet<_>>();
|
||||||
|
|
||||||
|
let mut layer_count = layers_on_disk.len();
|
||||||
|
let mut layer_byte_count: u64 = timeline_state
|
||||||
|
.on_disk_layers
|
||||||
|
.values()
|
||||||
|
.map(|l| l.metadata.file_size())
|
||||||
|
.sum();
|
||||||
|
|
||||||
|
// Remove on-disk layers that are no longer present in heatmap
|
||||||
|
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
||||||
|
layer_count -= 1;
|
||||||
|
layer_byte_count -= timeline_state
|
||||||
|
.on_disk_layers
|
||||||
|
.get(layer)
|
||||||
|
.unwrap()
|
||||||
|
.metadata
|
||||||
|
.file_size();
|
||||||
|
|
||||||
|
delete_layers.push((*timeline_id, (*layer).clone()));
|
||||||
|
}
|
||||||
|
|
||||||
|
progress.bytes_downloaded += layer_byte_count;
|
||||||
|
progress.layers_downloaded += layer_count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Execute accumulated deletions
|
||||||
|
for (timeline_id, layer_name) in delete_layers {
|
||||||
|
let timeline_path = self
|
||||||
|
.conf
|
||||||
|
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
|
||||||
|
let local_path = timeline_path.join(layer_name.to_string());
|
||||||
|
tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);
|
||||||
|
|
||||||
|
tokio::fs::remove_file(&local_path)
|
||||||
|
.await
|
||||||
|
.or_else(fs_ext::ignore_not_found)
|
||||||
|
.maybe_fatal_err("Removing secondary layer")?;
|
||||||
|
|
||||||
|
// Update in-memory housekeeping to reflect the absence of the deleted layer
|
||||||
|
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||||
|
let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
timeline_state.on_disk_layers.remove(&layer_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
for timeline_id in delete_timelines {
|
||||||
|
let timeline_path = self
|
||||||
|
.conf
|
||||||
|
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
|
||||||
|
tracing::info!(timeline_id=%timeline_id,
|
||||||
|
"Timeline no longer in heatmap, removing from secondary location"
|
||||||
|
);
|
||||||
|
tokio::fs::remove_dir_all(&timeline_path)
|
||||||
|
.await
|
||||||
|
.or_else(fs_ext::ignore_not_found)
|
||||||
|
.maybe_fatal_err("Removing secondary timeline")?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(progress)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object
|
||||||
|
/// still matches `prev_etag`.
|
||||||
|
async fn download_heatmap(
|
||||||
|
&self,
|
||||||
|
prev_etag: Option<&Etag>,
|
||||||
|
) -> Result<HeatMapDownload, UpdateError> {
|
||||||
debug_assert_current_span_has_tenant_id();
|
debug_assert_current_span_has_tenant_id();
|
||||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||||
// TODO: make download conditional on ETag having changed since last download
|
// TODO: pull up etag check into the request, to do a conditional GET rather than
|
||||||
|
// issuing a GET and then maybe ignoring the response body
|
||||||
// (https://github.com/neondatabase/neon/issues/6199)
|
// (https://github.com/neondatabase/neon/issues/6199)
|
||||||
tracing::debug!("Downloading heatmap for secondary tenant",);
|
tracing::debug!("Downloading heatmap for secondary tenant",);
|
||||||
|
|
||||||
let heatmap_path = remote_heatmap_path(tenant_shard_id);
|
let heatmap_path = remote_heatmap_path(tenant_shard_id);
|
||||||
let cancel = &self.secondary_state.cancel;
|
let cancel = &self.secondary_state.cancel;
|
||||||
|
|
||||||
let heatmap_bytes = backoff::retry(
|
backoff::retry(
|
||||||
|| async {
|
|| async {
|
||||||
let download = self
|
let download = self
|
||||||
.remote_storage
|
.remote_storage
|
||||||
.download(&heatmap_path, cancel)
|
.download(&heatmap_path, cancel)
|
||||||
.await
|
.await
|
||||||
.map_err(UpdateError::from)?;
|
.map_err(UpdateError::from)?;
|
||||||
let mut heatmap_bytes = Vec::new();
|
|
||||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
if Some(&download.etag) == prev_etag {
|
||||||
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
Ok(HeatMapDownload::Unmodified)
|
||||||
Ok(heatmap_bytes)
|
} else {
|
||||||
|
let mut heatmap_bytes = Vec::new();
|
||||||
|
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||||
|
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
||||||
|
SECONDARY_MODE.download_heatmap.inc();
|
||||||
|
Ok(HeatMapDownload::Modified(HeatMapModified {
|
||||||
|
etag: download.etag,
|
||||||
|
last_modified: download.last_modified,
|
||||||
|
bytes: heatmap_bytes,
|
||||||
|
}))
|
||||||
|
}
|
||||||
},
|
},
|
||||||
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
|
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
|
||||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||||
@@ -548,11 +732,7 @@ impl<'a> TenantDownloader<'a> {
|
|||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.ok_or_else(|| UpdateError::Cancelled)
|
.ok_or_else(|| UpdateError::Cancelled)
|
||||||
.and_then(|x| x)?;
|
.and_then(|x| x)
|
||||||
|
|
||||||
SECONDARY_MODE.download_heatmap.inc();
|
|
||||||
|
|
||||||
Ok(heatmap_bytes)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
|
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
|
||||||
@@ -593,31 +773,13 @@ impl<'a> TenantDownloader<'a> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let layers_in_heatmap = timeline
|
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||||
.layers
|
|
||||||
.iter()
|
|
||||||
.map(|l| &l.name)
|
|
||||||
.collect::<HashSet<_>>();
|
|
||||||
let layers_on_disk = timeline_state
|
|
||||||
.on_disk_layers
|
|
||||||
.iter()
|
|
||||||
.map(|l| l.0)
|
|
||||||
.collect::<HashSet<_>>();
|
|
||||||
|
|
||||||
// Remove on-disk layers that are no longer present in heatmap
|
|
||||||
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
|
||||||
let local_path = timeline_path.join(layer.to_string());
|
|
||||||
tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
|
|
||||||
tokio::fs::remove_file(&local_path)
|
|
||||||
.await
|
|
||||||
.or_else(fs_ext::ignore_not_found)
|
|
||||||
.maybe_fatal_err("Removing secondary layer")?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Download heatmap layers that are not present on local disk, or update their
|
// Download heatmap layers that are not present on local disk, or update their
|
||||||
// access time if they are already present.
|
// access time if they are already present.
|
||||||
for layer in timeline.layers {
|
for layer in timeline.layers {
|
||||||
if self.secondary_state.cancel.is_cancelled() {
|
if self.secondary_state.cancel.is_cancelled() {
|
||||||
|
tracing::debug!("Cancelled -- dropping out of layer loop");
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -662,6 +824,12 @@ impl<'a> TenantDownloader<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Failpoint for simulating slow remote storage
|
||||||
|
failpoint_support::sleep_millis_async!(
|
||||||
|
"secondary-layer-download-sleep",
|
||||||
|
&self.secondary_state.cancel
|
||||||
|
);
|
||||||
|
|
||||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||||
let downloaded_bytes = match download_layer_file(
|
let downloaded_bytes = match download_layer_file(
|
||||||
self.conf,
|
self.conf,
|
||||||
@@ -701,6 +869,11 @@ impl<'a> TenantDownloader<'a> {
|
|||||||
tokio::fs::remove_file(&local_path)
|
tokio::fs::remove_file(&local_path)
|
||||||
.await
|
.await
|
||||||
.or_else(fs_ext::ignore_not_found)?;
|
.or_else(fs_ext::ignore_not_found)?;
|
||||||
|
} else {
|
||||||
|
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
|
||||||
|
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||||
|
progress.bytes_downloaded += downloaded_bytes;
|
||||||
|
progress.layers_downloaded += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
SECONDARY_MODE.download_layer.inc();
|
SECONDARY_MODE.download_layer.inc();
|
||||||
@@ -789,7 +962,10 @@ async fn init_timeline_state(
|
|||||||
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
||||||
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
|
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
|
||||||
continue;
|
continue;
|
||||||
} else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
|
} else if crate::is_temporary(&file_path)
|
||||||
|
|| is_temp_download_file(&file_path)
|
||||||
|
|| is_ephemeral_file(file_name)
|
||||||
|
{
|
||||||
// Temporary files are frequently left behind from restarting during downloads
|
// Temporary files are frequently left behind from restarting during downloads
|
||||||
tracing::info!("Cleaning up temporary file {file_path}");
|
tracing::info!("Cleaning up temporary file {file_path}");
|
||||||
if let Err(e) = tokio::fs::remove_file(&file_path)
|
if let Err(e) = tokio::fs::remove_file(&file_path)
|
||||||
|
|||||||
@@ -62,3 +62,25 @@ impl HeatMapTimeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) struct HeatMapStats {
|
||||||
|
pub(crate) bytes: u64,
|
||||||
|
pub(crate) layers: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HeatMapTenant {
|
||||||
|
pub(crate) fn get_stats(&self) -> HeatMapStats {
|
||||||
|
let mut stats = HeatMapStats {
|
||||||
|
bytes: 0,
|
||||||
|
layers: 0,
|
||||||
|
};
|
||||||
|
for timeline in &self.timelines {
|
||||||
|
for layer in &timeline.layers {
|
||||||
|
stats.layers += 1;
|
||||||
|
stats.bytes += layer.metadata.file_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
stats
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ use crate::{
|
|||||||
metrics::SECONDARY_MODE,
|
metrics::SECONDARY_MODE,
|
||||||
tenant::{
|
tenant::{
|
||||||
config::AttachmentMode,
|
config::AttachmentMode,
|
||||||
|
mgr::GetTenantError,
|
||||||
mgr::TenantManager,
|
mgr::TenantManager,
|
||||||
remote_timeline_client::remote_heatmap_path,
|
remote_timeline_client::remote_heatmap_path,
|
||||||
span::debug_assert_current_span_has_tenant_id,
|
span::debug_assert_current_span_has_tenant_id,
|
||||||
@@ -292,8 +293,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
|||||||
"Starting heatmap write on command");
|
"Starting heatmap write on command");
|
||||||
let tenant = self
|
let tenant = self
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.get_attached_tenant_shard(*tenant_shard_id, true)
|
.get_attached_tenant_shard(*tenant_shard_id)
|
||||||
.map_err(|e| anyhow::anyhow!(e))?;
|
.map_err(|e| anyhow::anyhow!(e))?;
|
||||||
|
if !tenant.is_active() {
|
||||||
|
return Err(GetTenantError::NotActive(*tenant_shard_id).into());
|
||||||
|
}
|
||||||
|
|
||||||
Ok(UploadPending {
|
Ok(UploadPending {
|
||||||
// Ignore our state for last digest: this forces an upload even if nothing has changed
|
// Ignore our state for last digest: this forces an upload even if nothing has changed
|
||||||
|
|||||||
@@ -300,6 +300,7 @@ where
|
|||||||
|
|
||||||
let tenant_shard_id = job.get_tenant_shard_id();
|
let tenant_shard_id = job.get_tenant_shard_id();
|
||||||
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
|
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
|
||||||
|
tracing::info!("Command already running, waiting for it");
|
||||||
barrier
|
barrier
|
||||||
} else {
|
} else {
|
||||||
let running = self.spawn_now(job);
|
let running = self.spawn_now(job);
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user