Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-26 14:50:36 +00:00)

Compare commits: arpad/comp ... release-pr (274 commits)
| Author | SHA1 | Date | |
|---|---|---|---|
| | 274 commits, 5bad8126dc through df7a9d1407 | | |
24  .github/workflows/build_and_test.yml  vendored

@@ -1121,36 +1121,18 @@ jobs:
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true

# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f deployStorage=false \
-f deployStorageBroker=false \
-f deployStorageController=false \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true

gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \

90  .github/workflows/trigger-e2e-tests.yml  vendored

@@ -62,14 +62,14 @@ jobs:

trigger-e2e-tests:
needs: [ tag ]
runs-on: ubuntu-latest
runs-on: [ self-hosted, gen3, small ]
env:
TAG: ${{ needs.tag.outputs.build-tag }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps:
- name: check if ecr image are present
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: |
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,55 +79,41 @@ jobs:
fi
done

- name: Set e2e-platforms
id: e2e-platforms
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Default set of platforms to run e2e tests on
platforms='["docker", "k8s"]'

# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
# If the workflow run is not a pull request, add k8s-neonvm to the list.
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
case "$f" in
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
;;
*)
# no-op
;;
esac
done
else
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
fi

echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT

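The platform-selection logic in the step above reduces to a small pure rule: start from `["docker", "k8s"]` and add `k8s-neonvm` when the change set touches `vendor/`, `pgxn/`, `libs/vm_monitor/`, or `Dockerfile.compute-node`, or when the run is not a pull request. The sketch below restates that rule in Rust purely for illustration; the function name and signature are hypothetical and not part of the workflow itself.

```rust
// Hypothetical illustration of the e2e platform-selection rule above
// (the real logic lives in the workflow's bash/jq step).
fn e2e_platforms(changed_files: &[&str], is_pull_request: bool) -> Vec<&'static str> {
    let mut platforms = vec!["docker", "k8s"];
    let needs_neonvm = !is_pull_request
        || changed_files.iter().any(|f| {
            f.starts_with("vendor/")
                || f.starts_with("pgxn/")
                || f.starts_with("libs/vm_monitor/")
                || *f == "Dockerfile.compute-node"
        });
    if needs_neonvm {
        // Add the extra platform per the rule described in the workflow comments.
        platforms.push("k8s-neonvm");
    }
    platforms
}
```

For example, a pull request that only touches files under `vendor/` would get `["docker", "k8s", "k8s-neonvm"]`.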
- name: Set PR's status to pending and request a remote CI test
env:
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}

gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
--method POST \
--raw-field "state=pending" \
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
--raw-field "context=neon-cloud-e2e"
REMOTE_REPO="${{ github.repository_owner }}/cloud"

gh workflow --repo ${REMOTE_REPO} \
run testing.yml \
--ref "main" \
--raw-field "ci_job_name=neon-cloud-e2e" \
--raw-field "commit_hash=$COMMIT_SHA" \
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
--raw-field "storage_image_tag=${TAG}" \
--raw-field "compute_image_tag=${TAG}" \
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"

curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${TAG}\",
\"compute_image_tag\": \"${TAG}\",
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
}
}"

@@ -1,13 +1,12 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling
/pageserver/ @neondatabase/storage
/pgxn/ @neondatabase/compute
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute

206  Cargo.lock  generated

@@ -276,7 +276,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"aws-config",
|
||||
"bytes",
|
||||
"aws-sdk-secretsmanager",
|
||||
"camino",
|
||||
"clap",
|
||||
"control_plane",
|
||||
@@ -288,8 +288,6 @@ dependencies = [
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
@@ -297,7 +295,6 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"r2d2",
|
||||
"reqwest",
|
||||
"routerify",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
@@ -346,9 +343,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-credential-types"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
|
||||
checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -358,9 +355,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-runtime"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
|
||||
checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-sigv4",
|
||||
@@ -380,29 +377,6 @@ dependencies = [
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-iam"
|
||||
version = "1.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-query",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"aws-smithy-xml",
|
||||
"aws-types",
|
||||
"http 0.2.9",
|
||||
"once_cell",
|
||||
"regex-lite",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-s3"
|
||||
version = "1.14.0"
|
||||
@@ -432,6 +406,29 @@ dependencies = [
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-secretsmanager"
|
||||
version = "1.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"aws-types",
|
||||
"bytes",
|
||||
"fastrand 2.0.0",
|
||||
"http 0.2.9",
|
||||
"once_cell",
|
||||
"regex-lite",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-sso"
|
||||
version = "1.12.0"
|
||||
@@ -501,9 +498,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sigv4"
|
||||
version = "1.2.0"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
|
||||
checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-smithy-eventstream",
|
||||
@@ -516,7 +513,7 @@ dependencies = [
|
||||
"hex",
|
||||
"hmac",
|
||||
"http 0.2.9",
|
||||
"http 1.1.0",
|
||||
"http 1.0.0",
|
||||
"once_cell",
|
||||
"p256",
|
||||
"percent-encoding",
|
||||
@@ -530,9 +527,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-async"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
|
||||
checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"pin-project-lite",
|
||||
@@ -573,9 +570,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-http"
|
||||
version = "0.60.7"
|
||||
version = "0.60.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
|
||||
checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d"
|
||||
dependencies = [
|
||||
"aws-smithy-eventstream",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -594,18 +591,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-json"
|
||||
version = "0.60.7"
|
||||
version = "0.60.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
|
||||
checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e"
|
||||
dependencies = [
|
||||
"aws-smithy-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-query"
|
||||
version = "0.60.7"
|
||||
version = "0.60.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb"
|
||||
checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9"
|
||||
dependencies = [
|
||||
"aws-smithy-types",
|
||||
"urlencoding",
|
||||
@@ -613,9 +610,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-runtime"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
|
||||
checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
@@ -638,15 +635,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-runtime-api"
|
||||
version = "1.2.0"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
|
||||
checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-types",
|
||||
"bytes",
|
||||
"http 0.2.9",
|
||||
"http 1.1.0",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -655,9 +651,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-types"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
|
||||
checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3"
|
||||
dependencies = [
|
||||
"base64-simd",
|
||||
"bytes",
|
||||
@@ -678,18 +674,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-xml"
|
||||
version = "0.60.7"
|
||||
version = "0.60.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
|
||||
checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218"
|
||||
dependencies = [
|
||||
"xmlparser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-types"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
|
||||
checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-smithy-async",
|
||||
@@ -2396,9 +2392,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "1.1.0"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
|
||||
checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
@@ -2498,7 +2494,7 @@ dependencies = [
|
||||
"hyper",
|
||||
"log",
|
||||
"rustls 0.21.9",
|
||||
"rustls-native-certs 0.6.2",
|
||||
"rustls-native-certs",
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
]
|
||||
@@ -2884,35 +2880,6 @@ version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "measured"
|
||||
version = "0.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"hashbrown 0.14.0",
|
||||
"itoa",
|
||||
"lasso",
|
||||
"measured-derive",
|
||||
"memchr",
|
||||
"parking_lot 0.12.1",
|
||||
"rustc-hash",
|
||||
"ryu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "measured-derive"
|
||||
version = "0.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.6.4"
|
||||
@@ -3581,7 +3548,6 @@ dependencies = [
|
||||
"strum_macros",
|
||||
"svg_fmt",
|
||||
"sync_wrapper",
|
||||
"sysinfo",
|
||||
"tenant_size_model",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
@@ -3935,7 +3901,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3948,7 +3914,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -3959,7 +3925,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -3972,13 +3938,12 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4200,10 +4165,6 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"aws-config",
|
||||
"aws-sdk-iam",
|
||||
"aws-sigv4",
|
||||
"aws-types",
|
||||
"base64 0.13.1",
|
||||
"bstr",
|
||||
"bytes",
|
||||
@@ -4214,7 +4175,6 @@ dependencies = [
|
||||
"consumption_metrics",
|
||||
"dashmap",
|
||||
"env_logger",
|
||||
"fallible-iterator",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.13.2",
|
||||
@@ -4222,7 +4182,6 @@ dependencies = [
|
||||
"hex",
|
||||
"hmac",
|
||||
"hostname",
|
||||
"http 1.1.0",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"hyper-tungstenite",
|
||||
@@ -4266,7 +4225,6 @@ dependencies = [
|
||||
"smallvec",
|
||||
"smol_str",
|
||||
"socket2 0.5.5",
|
||||
"subtle",
|
||||
"sync_wrapper",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
@@ -4438,9 +4396,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "redis"
|
||||
version = "0.25.2"
|
||||
version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
|
||||
checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -4449,15 +4407,15 @@ dependencies = [
|
||||
"itoa",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.22.2",
|
||||
"rustls-native-certs 0.7.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls-pki-types",
|
||||
"rustls 0.21.9",
|
||||
"rustls-native-certs",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"rustls-webpki 0.101.7",
|
||||
"ryu",
|
||||
"sha1_smol",
|
||||
"socket2 0.5.5",
|
||||
"socket2 0.4.9",
|
||||
"tokio",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"url",
|
||||
]
|
||||
@@ -4886,19 +4844,6 @@ dependencies = [
|
||||
"security-framework",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-native-certs"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792"
|
||||
dependencies = [
|
||||
"openssl-probe",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls-pki-types",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-pemfile"
|
||||
version = "1.0.2"
|
||||
@@ -5401,23 +5346,13 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
|
||||
|
||||
[[package]]
|
||||
name = "sha2"
|
||||
version = "0.10.8"
|
||||
version = "0.10.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
|
||||
checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures",
|
||||
"digest",
|
||||
"sha2-asm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha2-asm"
|
||||
version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6000,7 +5935,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -6166,7 +6101,7 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost",
|
||||
"rustls-native-certs 0.6.2",
|
||||
"rustls-native-certs",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
@@ -6533,7 +6468,6 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
@@ -6572,14 +6506,12 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -7051,6 +6983,7 @@ dependencies = [
|
||||
"aws-sigv4",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"axum",
|
||||
"base64 0.21.1",
|
||||
@@ -7096,7 +7029,6 @@ dependencies = [
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"subtle",
|
||||
"syn 1.0.109",
|
||||
|
||||
10  Cargo.toml

@@ -52,12 +52,10 @@ async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.14"
aws-sdk-iam = "1.15.0"
aws-sdk-secretsmanager = { version = "1.14.0" }
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.4"
aws-credential-types = "1.1.4"
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
aws-types = "1.1.7"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -78,7 +76,6 @@ either = "1.8"
enum-map = "2.4.2"
enumset = "1.0.12"
fail = "0.5.0"
fallible-iterator = "0.2"
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
@@ -91,7 +88,6 @@ hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"
hostname = "0.3.1"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
@@ -105,7 +101,6 @@ lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.13", features=["default", "lasso"] }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -125,7 +120,7 @@ procfs = "0.14"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
@@ -153,7 +148,6 @@ smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
"subtle" = "2.5.0"
svg_fmt = "0.4.1"
sync_wrapper = "0.1.2"
tar = "0.4"

@@ -135,7 +135,7 @@ WORKDIR /home/nonroot

# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.77.0
ENV RUSTC_VERSION=1.76.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
cargo install --git https://github.com/paritytech/cachepot && \
cargo install rustfilt && \
cargo install cargo-hakari && \
cargo install cargo-deny --locked && \
cargo install cargo-deny && \
cargo install cargo-hack && \
cargo install cargo-nextest && \
rm -rf /home/nonroot/.cargo/registry && \

@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions

# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)

@@ -32,29 +32,6 @@ compute_ctl -D /var/db/postgres/compute \
-b /usr/local/bin/postgres
```

## State Diagram

Computes can be in various states. Below is a diagram that details how a
compute moves between states.

```mermaid
%% https://mermaid.js.org/syntax/stateDiagram.html
stateDiagram-v2
[*] --> Empty : Compute spawned
Empty --> ConfigurationPending : Waiting for compute spec
ConfigurationPending --> Configuration : Received compute spec
Configuration --> Failed : Failed to configure the compute
Configuration --> Running : Compute has been configured
Empty --> Init : Compute spec is immediately available
Empty --> TerminationPending : Requested termination
Init --> Failed : Failed to start Postgres
Init --> Running : Started Postgres
Running --> TerminationPending : Requested termination
TerminationPending --> Terminated : Terminated compute
Failed --> [*] : Compute exited
Terminated --> [*] : Compute exited
```
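As a rough illustration of the lifecycle in the diagram above, the states and their legal transitions could be modeled as a plain enum. This is a sketch only, not the actual `compute_ctl` types; the names mirror the diagram rather than the code.

```rust
/// Sketch of the compute states from the diagram above (hypothetical type,
/// not the real compute_ctl state machine).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ComputeState {
    Empty,
    ConfigurationPending,
    Configuration,
    Init,
    Running,
    TerminationPending,
    Terminated,
    Failed,
}

impl ComputeState {
    /// True if `self -> next` is one of the edges drawn in the diagram.
    fn can_transition(self, next: ComputeState) -> bool {
        use ComputeState::*;
        matches!(
            (self, next),
            (Empty, ConfigurationPending)
                | (Empty, Init)
                | (Empty, TerminationPending)
                | (ConfigurationPending, Configuration)
                | (Configuration, Running)
                | (Configuration, Failed)
                | (Init, Running)
                | (Init, Failed)
                | (Running, TerminationPending)
                | (TerminationPending, Terminated)
        )
    }
}
```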

## Tests

Cargo formatter:

@@ -1262,12 +1262,10 @@ LIMIT 100",
.await
.map_err(DownloadError::Other);

if download_size.is_ok() {
self.ext_download_progress
.write()
.expect("bad lock")
.insert(ext_archive_name.to_string(), (download_start, true));
}
self.ext_download_progress
.write()
.expect("bad lock")
.insert(ext_archive_name.to_string(), (download_start, true));

download_size
}

@@ -17,7 +17,6 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
.write(true)
.create(true)
.append(false)
.truncate(false)
.open(path)?;
let buf = io::BufReader::new(&file);
let mut count: usize = 0;

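For context on the `OpenOptions` chain this hunk touches, here is a self-contained sketch of the general "read the file, then append a line only if it is missing" pattern. It is hypothetical and not the actual `line_in_file` implementation, whose full body is not shown in this diff.

```rust
// Hypothetical sketch: ensure `line` is present in the file at `path`,
// returning Ok(true) if it had to be appended. Not the real compute_tools code.
use std::fs::OpenOptions;
use std::io::{self, BufRead, BufReader, Seek, SeekFrom, Write};
use std::path::Path;

fn ensure_line(path: &Path, line: &str) -> io::Result<bool> {
    let mut file = OpenOptions::new()
        .read(true)
        .write(true)
        .create(true)
        .truncate(false)
        .open(path)?;
    // Scan the existing contents for an exact match.
    let already_present = BufReader::new(&file)
        .lines()
        .filter_map(Result::ok)
        .any(|l| l == line);
    if !already_present {
        // The reader advanced the cursor, so seek to the end before appending.
        file.seek(SeekFrom::End(0))?;
        writeln!(file, "{line}")?;
    }
    Ok(!already_present)
}
```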
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser.
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
name.pg_quote()
);
info!("running role create query: '{}'", &query);
@@ -743,24 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
// which may happen in two cases:
// - extension was just installed
// - extension was already installed and is up to date
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
if let Err(e) = client.simple_query(query) {
error!(
"failed to upgrade neon extension during `handle_extension_neon`: {}",
e
);
}
// DISABLED due to compute node unpinning epic
// let query = "ALTER EXTENSION neon UPDATE";
// info!("update neon extension version with query: {}", query);
// client.simple_query(query)?;

Ok(())
}

#[instrument(skip_all)]
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade");
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
client.simple_query(query)?;
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade (not really)");
// DISABLED due to compute node unpinning epic
// let query = "ALTER EXTENSION neon UPDATE";
// info!("update neon extension version with query: {}", query);
// client.simple_query(query)?;

Ok(())
}

@@ -809,8 +806,19 @@ $$;"#,
"",
"",
"",
"",
// Add new migrations below.
r#"
DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END
$$;"#,
];

let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";

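The `migrations` array above is an ordered list of SQL statements, with empty `""` entries keeping the historical numbering stable for migrations that were retired. The sketch below shows one plausible way such a list could be applied and resumed; the `neon_migration.migration_id` bookkeeping statement is an assumption for illustration, not code taken from this diff.

```rust
// Hypothetical sketch of running the ordered migration list with the
// `postgres` crate; not the real compute_ctl migration runner.
use anyhow::Result;
use postgres::Client;

fn apply_migrations(client: &mut Client, migrations: &[&str], already_applied: usize) -> Result<()> {
    for (idx, sql) in migrations.iter().enumerate().skip(already_applied) {
        if sql.is_empty() {
            continue; // retired migration slot, kept only for numbering
        }
        client.simple_query(sql)?;
        // Assumed bookkeeping so a restart resumes after the last applied migration.
        client.simple_query(&format!(
            "UPDATE neon_migration.migration_id SET id = {}",
            idx + 1
        ))?;
    }
    Ok(())
}
```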
@@ -16,7 +16,7 @@ testing = []

[dependencies]
anyhow.workspace = true
aws-config.workspace = true
bytes.workspace = true
aws-sdk-secretsmanager.workspace = true
camino.workspace = true
clap.workspace = true
fail.workspace = true
@@ -25,20 +25,17 @@ git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
lasso.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
reqwest.workspace = true
routerify.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
measured.workspace = true

diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }

@@ -1,3 +0,0 @@

UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';
UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';

@@ -1,3 +0,0 @@

UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}';
UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"';

@@ -1,3 +0,0 @@
-- This file should undo anything in `up.sql`

ALTER TABLE tenant_shards drop scheduling_policy;

@@ -1,2 +0,0 @@

ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"';

@@ -14,6 +14,7 @@ use utils::{

use crate::service::Config;

const BUSY_DELAY: Duration = Duration::from_secs(1);
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);

pub(crate) const API_CONCURRENCY: usize = 32;

@@ -279,10 +280,11 @@ impl ComputeHook {
Err(NotifyError::SlowDown)
}
StatusCode::LOCKED => {
// We consider this fatal, because it's possible that the operation blocking the control one is
// also the one that is waiting for this reconcile. We should let the reconciler calling
// this hook fail, to give control plane a chance to un-lock.
tracing::info!("Control plane reports tenant is locked, dropping out of notify");
// Delay our retry if busy: the usual fast exponential backoff in backoff::retry
// is not appropriate
tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
.await
.ok();
Err(NotifyError::Busy)
}
StatusCode::SERVICE_UNAVAILABLE

@@ -304,12 +306,7 @@ impl ComputeHook {
let client = reqwest::Client::new();
backoff::retry(
|| self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|e| {
matches!(
e,
NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
)
},
|e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
3,
10,
"Send compute notification",

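The `backoff::retry` call above takes the operation to run, a predicate that marks certain errors as permanent, retry limits, and a description; the two predicates visible in this hunk differ only in whether `NotifyError::Busy` counts as permanent. As a generic illustration of that pattern only — this is not Neon's `utils::backoff` API, and it assumes a tokio runtime:

```rust
// Generic sketch of "retry with exponential backoff unless the error is
// permanent"; purely illustrative.
use std::future::Future;
use std::time::Duration;

async fn retry_with_backoff<T, E, Op, Fut>(
    mut op: Op,
    is_permanent: impl Fn(&E) -> bool,
    base_delay: Duration,
    max_attempts: u32,
) -> Result<T, E>
where
    Op: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    let mut attempt = 0;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            // Give up immediately on permanent errors, or once attempts run out.
            Err(e) if is_permanent(&e) || attempt + 1 >= max_attempts => return Err(e),
            Err(_) => {
                // Exponential backoff between attempts: base_delay * 2^attempt.
                tokio::time::sleep(base_delay * 2u32.pow(attempt)).await;
                attempt += 1;
            }
        }
    }
}
```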
@@ -139,7 +139,7 @@ impl HeartbeaterTask {
.with_client_retries(
|client| async move { client.get_utilization().await },
&jwt_token,
3,
2,
3,
Duration::from_secs(1),
&cancel,

@@ -1,11 +1,5 @@
|
||||
use crate::metrics::{
|
||||
HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
|
||||
METRICS_REGISTRY,
|
||||
};
|
||||
use crate::reconciler::ReconcileError;
|
||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use futures::Future;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use pageserver_api::models::{
|
||||
@@ -34,15 +28,12 @@ use utils::{
|
||||
};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest,
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||
};
|
||||
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
|
||||
|
||||
use routerify::Middleware;
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
pub struct HttpState {
|
||||
@@ -322,7 +313,7 @@ async fn handle_tenant_timeline_passthrough(
|
||||
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
|
||||
|
||||
// Find the node that holds shard zero
|
||||
let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
|
||||
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
|
||||
|
||||
// Callers will always pass an unsharded tenant ID. Before proxying, we must
|
||||
// rewrite this to a shard-aware shard zero ID.
|
||||
@@ -331,39 +322,12 @@ async fn handle_tenant_timeline_passthrough(
|
||||
let tenant_shard_str = format!("{}", tenant_shard_id);
|
||||
let path = path.replace(&tenant_str, &tenant_shard_str);
|
||||
|
||||
let latency = &METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_passthrough_request_latency;
|
||||
|
||||
// This is a bit awkward. We remove the param from the request
|
||||
// and join the words by '_' to get a label for the request.
|
||||
let just_path = path.replace(&tenant_shard_str, "");
|
||||
let path_label = just_path
|
||||
.split('/')
|
||||
.filter(|token| !token.is_empty())
|
||||
.collect::<Vec<_>>()
|
||||
.join("_");
|
||||
let labels = PageserverRequestLabelGroup {
|
||||
pageserver_id: &node.get_id().to_string(),
|
||||
path: &path_label,
|
||||
method: crate::metrics::Method::Get,
|
||||
};
|
||||
|
||||
let _timer = latency.start_timer(labels.clone());
|
||||
|
||||
let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
|
||||
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
|
||||
let resp = client.get_raw(path).await.map_err(|_e|
|
||||
// FIXME: give APiError a proper Unavailable variant. We return 503 here because
|
||||
// if we can't successfully send a request to the pageserver, we aren't available.
|
||||
ApiError::ShuttingDown)?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let error_counter = &METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_passthrough_request_error;
|
||||
error_counter.inc(labels);
|
||||
}
|
||||
|
||||
// We have a reqest::Response, would like a http::Response
|
||||
let mut builder = hyper::Response::builder()
|
||||
.status(resp.status())
|
||||
@@ -389,16 +353,6 @@ async fn handle_tenant_locate(
|
||||
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_describe(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -479,22 +433,6 @@ async fn handle_tenant_shard_migrate(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
.tenant_update_policy(tenant_id, update_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
@@ -526,14 +464,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, state.service.consistency_check().await?)
|
||||
}
|
||||
|
||||
async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
|
||||
}
|
||||
|
||||
/// Status endpoint is just used for checking that our HTTP listener is up
|
||||
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -558,11 +488,7 @@ impl From<ReconcileError> for ApiError {
|
||||
|
||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
||||
/// be allowed to run if Service has finished its initial reconciliation.
|
||||
async fn tenant_service_handler<R, H>(
|
||||
request: Request<Body>,
|
||||
handler: H,
|
||||
request_name: RequestName,
|
||||
) -> R::Output
|
||||
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
|
||||
where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
|
||||
@@ -582,10 +508,9 @@ where
|
||||
));
|
||||
}
|
||||
|
||||
named_request_span(
|
||||
request_span(
|
||||
request,
|
||||
|request| async move { handler(service, request).await },
|
||||
request_name,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -596,98 +521,11 @@ fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct RequestMeta {
|
||||
method: hyper::http::Method,
|
||||
at: Instant,
|
||||
}
|
||||
|
||||
fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::pre(move |req| async move {
|
||||
let meta = RequestMeta {
|
||||
method: req.method().clone(),
|
||||
at: Instant::now(),
|
||||
};
|
||||
|
||||
req.set_context(meta);
|
||||
|
||||
Ok(req)
|
||||
})
|
||||
}
|
||||
|
||||
fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||
) -> Middleware<B, ApiError> {
|
||||
Middleware::post_with_info(move |resp, req_info| async move {
|
||||
let request_name = match req_info.context::<RequestName>() {
|
||||
Some(name) => name,
|
||||
None => {
|
||||
return Ok(resp);
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(meta) = req_info.context::<RequestMeta>() {
|
||||
let status = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_http_request_status;
|
||||
let latency = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_http_request_latency;
|
||||
|
||||
status.inc(HttpRequestStatusLabelGroup {
|
||||
path: request_name.0,
|
||||
method: meta.method.clone().into(),
|
||||
status: crate::metrics::StatusCode(resp.status()),
|
||||
});
|
||||
|
||||
latency.observe(
|
||||
HttpRequestLatencyLabelGroup {
|
||||
path: request_name.0,
|
||||
method: meta.method.into(),
|
||||
},
|
||||
meta.at.elapsed().as_secs_f64(),
|
||||
);
|
||||
}
|
||||
Ok(resp)
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
|
||||
|
||||
let payload = crate::metrics::METRICS_REGISTRY.encode();
|
||||
let response = Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, TEXT_FORMAT)
|
||||
.body(payload.into())
|
||||
.unwrap();
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct RequestName(&'static str);
|
||||
|
||||
async fn named_request_span<R, H>(
|
||||
request: Request<Body>,
|
||||
handler: H,
|
||||
name: RequestName,
|
||||
) -> R::Output
|
||||
where
|
||||
R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
request.set_context(name);
|
||||
request_span(request, handler).await
|
||||
}
|
||||
|
||||
pub fn make_router(
|
||||
service: Arc<Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router()
|
||||
.middleware(prologue_metrics_middleware())
|
||||
.middleware(epilogue_metrics_middleware());
|
||||
let mut router = endpoint::make_router();
|
||||
if auth.is_some() {
|
||||
router = router.middleware(auth_middleware(|request| {
|
||||
let state = get_state(request);
|
||||
@@ -696,176 +534,96 @@ pub fn make_router(
|
||||
} else {
|
||||
state.auth.as_deref()
|
||||
}
|
||||
}));
|
||||
}))
|
||||
}
|
||||
|
||||
router
|
||||
.data(Arc::new(HttpState::new(service, auth)))
|
||||
.get("/metrics", |r| {
|
||||
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
|
||||
})
|
||||
// Non-prefixed generic endpoints (status, metrics)
|
||||
.get("/status", |r| {
|
||||
named_request_span(r, handle_status, RequestName("status"))
|
||||
})
|
||||
.get("/ready", |r| {
|
||||
named_request_span(r, handle_ready, RequestName("ready"))
|
||||
})
|
||||
.get("/status", |r| request_span(r, handle_status))
|
||||
.get("/ready", |r| request_span(r, handle_ready))
|
||||
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
||||
.post("/upcall/v1/re-attach", |r| {
|
||||
named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
|
||||
})
|
||||
.post("/upcall/v1/validate", |r| {
|
||||
named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
|
||||
request_span(r, handle_re_attach)
|
||||
})
|
||||
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
|
||||
// Test/dev/debug endpoints
|
||||
.post("/debug/v1/attach-hook", |r| {
|
||||
named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
|
||||
})
|
||||
.post("/debug/v1/inspect", |r| {
|
||||
named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
|
||||
request_span(r, handle_attach_hook)
|
||||
})
|
||||
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
|
||||
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
|
||||
named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
|
||||
request_span(r, handle_tenant_drop)
|
||||
})
|
||||
.post("/debug/v1/node/:node_id/drop", |r| {
|
||||
named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
|
||||
})
|
||||
.get("/debug/v1/tenant", |r| {
|
||||
named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
|
||||
})
|
||||
.get("/debug/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_locate,
|
||||
RequestName("debug_v1_tenant_locate"),
|
||||
)
|
||||
request_span(r, handle_node_drop)
|
||||
})
|
||||
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
|
||||
.get("/debug/v1/scheduler", |r| {
|
||||
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
|
||||
request_span(r, handle_scheduler_dump)
|
||||
})
|
||||
.post("/debug/v1/consistency_check", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_consistency_check,
|
||||
RequestName("debug_v1_consistency_check"),
|
||||
)
|
||||
})
|
||||
.post("/debug/v1/reconcile_all", |r| {
|
||||
request_span(r, handle_reconcile_all)
|
||||
request_span(r, handle_consistency_check)
|
||||
})
|
||||
.put("/debug/v1/failpoints", |r| {
|
||||
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
|
||||
})
|
||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
// Node operations
|
||||
.post("/control/v1/node", |r| {
|
||||
named_request_span(r, handle_node_register, RequestName("control_v1_node"))
|
||||
})
|
||||
.get("/control/v1/node", |r| {
|
||||
named_request_span(r, handle_node_list, RequestName("control_v1_node"))
|
||||
request_span(r, handle_node_register)
|
||||
})
|
||||
.get("/control/v1/node", |r| request_span(r, handle_node_list))
|
||||
.put("/control/v1/node/:node_id/config", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_node_configure,
|
||||
RequestName("control_v1_node_config"),
|
||||
)
|
||||
request_span(r, handle_node_configure)
|
||||
})
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_shard_migrate,
|
||||
RequestName("control_v1_tenant_migrate"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||
})
|
||||
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_shard_split,
|
||||
RequestName("control_v1_tenant_shard_split"),
|
||||
)
|
||||
})
|
||||
.get("/control/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_describe,
|
||||
RequestName("control_v1_tenant_describe"),
|
||||
)
|
||||
})
|
||||
.put("/control/v1/tenant/:tenant_id/policy", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_tenant_update_policy,
|
||||
RequestName("control_v1_tenant_policy"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_shard_split)
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
.post("/v1/tenant", |r| {
|
||||
tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
|
||||
tenant_service_handler(r, handle_tenant_create)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
|
||||
tenant_service_handler(r, handle_tenant_delete)
|
||||
})
|
||||
.put("/v1/tenant/config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
|
||||
tenant_service_handler(r, handle_tenant_config_set)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
|
||||
tenant_service_handler(r, handle_tenant_config_get)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_location_config,
|
||||
RequestName("v1_tenant_location_config"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_location_config)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_time_travel_remote_storage,
|
||||
RequestName("v1_tenant_time_travel_remote_storage"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_secondary_download,
|
||||
RequestName("v1_tenant_secondary_download"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_secondary_download)
|
||||
})
|
||||
// Timeline operations
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_timeline_delete,
|
||||
RequestName("v1_tenant_timeline"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_timeline_delete)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_timeline_create,
|
||||
RequestName("v1_tenant_timeline"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_timeline_create)
|
||||
})
|
||||
// Tenant detail GET passthrough to shard zero
|
||||
.get("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_timeline_passthrough,
|
||||
RequestName("v1_tenant_passthrough"),
|
||||
)
|
||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||
})
|
||||
        // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
        // timeline GET APIs will be implicitly included.
        .get("/v1/tenant/:tenant_id/timeline*", |r| {
            tenant_service_handler(
                r,
                handle_tenant_timeline_passthrough,
                RequestName("v1_tenant_timeline_passthrough"),
            )
            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
}
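The `^/v1/` routes above let a shard-naive client talk to the storage controller exactly as it would talk to a single pageserver. A minimal sketch of such a call, assuming a plain reqwest client with the `json` feature enabled and hypothetical `base_url`/`tenant_id` values (none of this is part of the diff):

// List timelines for a tenant through the controller's "Virtual Pageserver" API;
// the controller forwards the GET to shard zero as described in the comment above.
async fn list_timelines_shard_naive(
    base_url: &str,
    tenant_id: &str,
    jwt: Option<&str>,
) -> reqwest::Result<serde_json::Value> {
    let url = format!("{base_url}/v1/tenant/{tenant_id}/timeline");
    let mut req = reqwest::Client::new().get(url);
    if let Some(jwt) = jwt {
        req = req.bearer_auth(jwt);
    }
    req.send().await?.json().await
}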
|
||||
|
||||
@@ -8,7 +8,6 @@ pub mod http;
|
||||
mod id_lock_map;
|
||||
pub mod metrics;
|
||||
mod node;
|
||||
mod pageserver_client;
|
||||
pub mod persistence;
|
||||
mod reconciler;
|
||||
mod scheduler;
|
||||
|
||||
@@ -3,6 +3,7 @@ use attachment_service::http::make_router;
|
||||
use attachment_service::metrics::preinitialize_metrics;
|
||||
use attachment_service::persistence::Persistence;
|
||||
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
|
||||
use aws_config::{BehaviorVersion, Region};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use diesel::Connection;
|
||||
@@ -54,31 +55,11 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
database_url: Option<String>,
|
||||
|
||||
/// Flag to enable dev mode, which permits running without auth
|
||||
#[arg(long, default_value = "false")]
|
||||
dev: bool,
|
||||
|
||||
/// Grace period before marking unresponsive pageserver offline
|
||||
#[arg(long)]
|
||||
max_unavailable_interval: Option<humantime::Duration>,
|
||||
}
|
||||
|
||||
enum StrictMode {
|
||||
/// In strict mode, we will require that all secrets are loaded, i.e. security features
|
||||
/// may not be implicitly turned off by omitting secrets in the environment.
|
||||
Strict,
|
||||
/// In dev mode, secrets are optional, and omitting a particular secret will implicitly
|
||||
/// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated
|
||||
/// requests, no public key -> don't authenticate incoming requests).
|
||||
Dev,
|
||||
}
|
||||
|
||||
impl Default for StrictMode {
|
||||
fn default() -> Self {
|
||||
Self::Strict
|
||||
}
|
||||
}
|
||||
|
||||
/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
|
||||
/// type encapsulates the logic to decide which and do the loading.
|
||||
struct Secrets {
|
||||
@@ -89,6 +70,13 @@ struct Secrets {
|
||||
}
|
||||
|
||||
impl Secrets {
|
||||
const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
|
||||
const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
|
||||
"neon-storage-controller-pageserver-jwt-token";
|
||||
const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
|
||||
"neon-storage-controller-control-plane-jwt-token";
|
||||
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
|
||||
|
||||
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
|
||||
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
|
||||
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
|
||||
@@ -99,41 +87,111 @@ impl Secrets {
|
||||
/// - Environment variables if DATABASE_URL is set.
|
||||
/// - AWS Secrets Manager secrets
|
||||
async fn load(args: &Cli) -> anyhow::Result<Self> {
|
||||
let Some(database_url) =
|
||||
Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
|
||||
else {
|
||||
anyhow::bail!(
|
||||
"Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
|
||||
)
|
||||
};
|
||||
|
||||
let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
|
||||
Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let this = Self {
|
||||
database_url,
|
||||
public_key,
|
||||
jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
|
||||
control_plane_jwt_token: Self::load_secret(
|
||||
&args.control_plane_jwt_token,
|
||||
Self::CONTROL_PLANE_JWT_TOKEN_ENV,
|
||||
)
|
||||
.await,
|
||||
};
|
||||
|
||||
Ok(this)
|
||||
match &args.database_url {
|
||||
Some(url) => Self::load_cli(url, args),
|
||||
None => match std::env::var(Self::DATABASE_URL_ENV) {
|
||||
Ok(database_url) => Self::load_env(database_url),
|
||||
Err(_) => Self::load_aws_sm().await,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
|
||||
if let Some(v) = cli {
|
||||
Some(v.clone())
|
||||
} else if let Ok(v) = std::env::var(env_name) {
|
||||
Some(v)
|
||||
} else {
|
||||
None
|
||||
fn load_env(database_url: String) -> anyhow::Result<Self> {
|
||||
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
|
||||
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
|
||||
Err(_) => None,
|
||||
};
|
||||
Ok(Self {
|
||||
database_url,
|
||||
public_key,
|
||||
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
|
||||
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn load_aws_sm() -> anyhow::Result<Self> {
|
||||
let Ok(region) = std::env::var("AWS_REGION") else {
|
||||
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
|
||||
};
|
||||
let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
|
||||
.region(Region::new(region.clone()))
|
||||
.load()
|
||||
.await;
|
||||
|
||||
let asm = aws_sdk_secretsmanager::Client::new(&config);
|
||||
|
||||
let Some(database_url) = asm
|
||||
.get_secret_value()
|
||||
.secret_id(Self::DATABASE_URL_SECRET)
|
||||
.send()
|
||||
.await?
|
||||
.secret_string()
|
||||
.map(str::to_string)
|
||||
else {
|
||||
anyhow::bail!(
|
||||
"Database URL secret not found at {region}/{}",
|
||||
Self::DATABASE_URL_SECRET
|
||||
)
|
||||
};
|
||||
|
||||
let jwt_token = asm
|
||||
.get_secret_value()
|
||||
.secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
|
||||
.send()
|
||||
.await?
|
||||
.secret_string()
|
||||
.map(str::to_string);
|
||||
if jwt_token.is_none() {
|
||||
tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
|
||||
}
|
||||
|
||||
let control_plane_jwt_token = asm
|
||||
.get_secret_value()
|
||||
.secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
|
||||
.send()
|
||||
.await?
|
||||
.secret_string()
|
||||
.map(str::to_string);
|
||||
if control_plane_jwt_token.is_none() {
|
||||
tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver");
|
||||
}
|
||||
|
||||
let public_key = asm
|
||||
.get_secret_value()
|
||||
.secret_id(Self::PUBLIC_KEY_SECRET)
|
||||
.send()
|
||||
.await?
|
||||
.secret_string()
|
||||
.map(str::to_string);
|
||||
let public_key = match public_key {
|
||||
Some(key) => Some(JwtAuth::from_key(key)?),
|
||||
None => {
|
||||
tracing::warn!(
|
||||
"No public key set: inccoming HTTP requests will not be authenticated"
|
||||
);
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
database_url,
|
||||
public_key,
|
||||
jwt_token,
|
||||
control_plane_jwt_token,
|
||||
})
|
||||
}
|
||||
|
||||
    fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
        let public_key = match &args.public_key {
            None => None,
            Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
        };
        Ok(Self {
            database_url: database_url.to_owned(),
            public_key,
            jwt_token: args.jwt_token.clone(),
            control_plane_jwt_token: args.control_plane_jwt_token.clone(),
        })
    }
}
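A hedged sketch of how a caller resolves secrets with the precedence described above (explicit CLI value, then environment, then AWS Secrets Manager); the helper function itself is hypothetical, the calls are the ones defined in this impl:

// Which branch `Secrets::load` takes is decided purely by what is present: a CLI flag
// wins, otherwise DATABASE_URL in the environment selects the env path, and only if
// both are absent does the AWS Secrets Manager path run (which also needs AWS_REGION).
async fn resolve_secrets(args: &Cli) -> anyhow::Result<Secrets> {
    let secrets = Secrets::load(args).await?;
    if secrets.public_key.is_none() {
        // Mirrors the strict-mode check later in the diff: with no public key,
        // incoming HTTP requests cannot be authenticated.
        tracing::warn!("no public key loaded; incoming requests will not be authenticated");
    }
    Ok(secrets)
}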
|
||||
|
||||
@@ -189,42 +247,8 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
args.listen
|
||||
);
|
||||
|
||||
let strict_mode = if args.dev {
|
||||
StrictMode::Dev
|
||||
} else {
|
||||
StrictMode::Strict
|
||||
};
|
||||
|
||||
let secrets = Secrets::load(&args).await?;
|
||||
|
||||
// Validate required secrets and arguments are provided in strict mode
|
||||
match strict_mode {
|
||||
StrictMode::Strict
|
||||
if (secrets.public_key.is_none()
|
||||
|| secrets.jwt_token.is_none()
|
||||
|| secrets.control_plane_jwt_token.is_none()) =>
|
||||
{
|
||||
// Production systems should always have secrets configured: if public_key was not set
|
||||
// then we would implicitly disable auth.
|
||||
anyhow::bail!(
|
||||
"Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode"
|
||||
);
|
||||
}
|
||||
StrictMode::Strict if args.compute_hook_url.is_none() => {
|
||||
// Production systems should always have a compute hook set, to prevent falling
|
||||
// back to trying to use neon_local.
|
||||
anyhow::bail!(
|
||||
"`--compute-hook-url` is not set: this is only permitted in `--dev` mode"
|
||||
);
|
||||
}
|
||||
StrictMode::Strict => {
|
||||
tracing::info!("Starting in strict mode: configuration is OK.")
|
||||
}
|
||||
StrictMode::Dev => {
|
||||
tracing::warn!("Starting in dev mode: this may be an insecure configuration.")
|
||||
}
|
||||
}
|
||||
|
||||
let config = Config {
|
||||
jwt_token: secrets.jwt_token,
|
||||
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
||||
|
||||
@@ -1,288 +1,32 @@
//!
//! This module provides metric definitions for the storage controller.
//!
//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds
//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`]
//! static.
//!
//! The rest of the code defines label group types and deals with converting outer types to labels.
//!
use bytes::Bytes;
|
||||
use measured::{
|
||||
label::{LabelValue, StaticLabelSet},
|
||||
FixedCardinalityLabel, MetricGroup,
|
||||
};
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::persistence::{DatabaseError, DatabaseOperation};
|
||||
pub(crate) struct ReconcilerMetrics {
|
||||
pub(crate) spawned: IntCounter,
|
||||
pub(crate) complete: IntCounterVec,
|
||||
}
|
||||
|
||||
pub(crate) static METRICS_REGISTRY: Lazy<StorageControllerMetrics> =
|
||||
Lazy::new(StorageControllerMetrics::default);
|
||||
impl ReconcilerMetrics {
|
||||
// Labels used on [`Self::complete`]
|
||||
pub(crate) const SUCCESS: &'static str = "ok";
|
||||
pub(crate) const ERROR: &'static str = "error";
|
||||
pub(crate) const CANCEL: &'static str = "cancel";
|
||||
}
|
||||
|
||||
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
|
||||
spawned: register_int_counter!(
|
||||
"storage_controller_reconcile_spawn",
|
||||
"Count of how many times we spawn a reconcile task",
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
complete: register_int_counter_vec!(
|
||||
"storage_controller_reconcile_complete",
|
||||
"Reconciler tasks completed, broken down by success/failure/cancelled",
|
||||
&["status"],
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
Lazy::force(&METRICS_REGISTRY);
|
||||
}
|
||||
|
||||
pub(crate) struct StorageControllerMetrics {
|
||||
pub(crate) metrics_group: StorageControllerMetricGroup,
|
||||
encoder: Mutex<measured::text::TextEncoder>,
|
||||
}
|
||||
|
||||
#[derive(measured::MetricGroup)]
|
||||
pub(crate) struct StorageControllerMetricGroup {
|
||||
/// Count of how many times we spawn a reconcile task
|
||||
pub(crate) storage_controller_reconcile_spawn: measured::Counter,
|
||||
/// Reconciler tasks completed, broken down by success/failure/cancelled
|
||||
pub(crate) storage_controller_reconcile_complete:
|
||||
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
|
||||
|
||||
/// Count of how many times we make an optimization change to a tenant's scheduling
|
||||
pub(crate) storage_controller_schedule_optimization: measured::Counter,
|
||||
|
||||
/// HTTP request status counters for handled requests
|
||||
pub(crate) storage_controller_http_request_status:
|
||||
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
|
||||
/// HTTP request handler latency across all status codes
|
||||
pub(crate) storage_controller_http_request_latency:
|
||||
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
|
||||
|
||||
/// Count of HTTP requests to the pageserver that resulted in an error,
|
||||
/// broken down by the pageserver node id, request name and method
|
||||
pub(crate) storage_controller_pageserver_request_error:
|
||||
measured::CounterVec<PageserverRequestLabelGroupSet>,
|
||||
|
||||
/// Latency of HTTP requests to the pageserver, broken down by pageserver
|
||||
/// node id, request name and method. This includes both successful and unsuccessful
|
||||
/// requests.
|
||||
pub(crate) storage_controller_pageserver_request_latency:
|
||||
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
|
||||
|
||||
/// Count of pass-through HTTP requests to the pageserver that resulted in an error,
|
||||
/// broken down by the pageserver node id, request name and method
|
||||
pub(crate) storage_controller_passthrough_request_error:
|
||||
measured::CounterVec<PageserverRequestLabelGroupSet>,
|
||||
|
||||
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
|
||||
/// node id, request name and method. This includes both successful and unsuccessful
|
||||
/// requests.
|
||||
pub(crate) storage_controller_passthrough_request_latency:
|
||||
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
|
||||
|
||||
/// Count of errors in database queries, broken down by error type and operation.
|
||||
pub(crate) storage_controller_database_query_error:
|
||||
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
|
||||
|
||||
/// Latency of database queries, broken down by operation.
|
||||
pub(crate) storage_controller_database_query_latency:
|
||||
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
|
||||
}
|
||||
|
||||
impl StorageControllerMetrics {
    pub(crate) fn encode(&self) -> Bytes {
        let mut encoder = self.encoder.lock().unwrap();
        self.metrics_group.collect_into(&mut *encoder);
        encoder.finish()
    }
}
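A brief sketch of how the registry is used from the rest of the controller. The two statements below are assumed call sites (not part of this diff), relying on `measured::Counter`'s `inc()`; every identifier in them comes from this module:

// In the reconciler: count a spawned reconcile task (plain counter, no labels).
METRICS_REGISTRY
    .metrics_group
    .storage_controller_reconcile_spawn
    .inc();

// In the /metrics handler: render all metric groups through the shared text encoder.
let body: Bytes = METRICS_REGISTRY.encode();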
|
||||
|
||||
impl Default for StorageControllerMetrics {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
metrics_group: StorageControllerMetricGroup::new(),
|
||||
encoder: Mutex::new(measured::text::TextEncoder::new()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StorageControllerMetricGroup {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
storage_controller_reconcile_spawn: measured::Counter::new(),
|
||||
storage_controller_reconcile_complete: measured::CounterVec::new(
|
||||
ReconcileCompleteLabelGroupSet {
|
||||
status: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_schedule_optimization: measured::Counter::new(),
|
||||
storage_controller_http_request_status: measured::CounterVec::new(
|
||||
HttpRequestStatusLabelGroupSet {
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
status: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_http_request_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
storage_controller_pageserver_request_error: measured::CounterVec::new(
|
||||
PageserverRequestLabelGroupSet {
|
||||
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
storage_controller_passthrough_request_error: measured::CounterVec::new(
|
||||
PageserverRequestLabelGroupSet {
|
||||
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
storage_controller_database_query_error: measured::CounterVec::new(
|
||||
DatabaseQueryErrorLabelGroupSet {
|
||||
operation: StaticLabelSet::new(),
|
||||
error_type: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_database_query_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = ReconcileCompleteLabelGroupSet)]
|
||||
pub(crate) struct ReconcileCompleteLabelGroup {
|
||||
pub(crate) status: ReconcileOutcome,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = HttpRequestStatusLabelGroupSet)]
|
||||
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) path: &'a str,
|
||||
pub(crate) method: Method,
|
||||
pub(crate) status: StatusCode,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = HttpRequestLatencyLabelGroupSet)]
|
||||
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) path: &'a str,
|
||||
pub(crate) method: Method,
|
||||
}
|
||||
|
||||
impl Default for HttpRequestLatencyLabelGroupSet {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup, Clone)]
|
||||
#[label(set = PageserverRequestLabelGroupSet)]
|
||||
pub(crate) struct PageserverRequestLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) pageserver_id: &'a str,
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) path: &'a str,
|
||||
pub(crate) method: Method,
|
||||
}
|
||||
|
||||
impl Default for PageserverRequestLabelGroupSet {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = DatabaseQueryErrorLabelGroupSet)]
|
||||
pub(crate) struct DatabaseQueryErrorLabelGroup {
|
||||
pub(crate) error_type: DatabaseErrorLabel,
|
||||
pub(crate) operation: DatabaseOperation,
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = DatabaseQueryLatencyLabelGroupSet)]
|
||||
pub(crate) struct DatabaseQueryLatencyLabelGroup {
|
||||
pub(crate) operation: DatabaseOperation,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel)]
|
||||
pub(crate) enum ReconcileOutcome {
|
||||
#[label(rename = "ok")]
|
||||
Success,
|
||||
Error,
|
||||
Cancel,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone)]
|
||||
pub(crate) enum Method {
|
||||
Get,
|
||||
Put,
|
||||
Post,
|
||||
Delete,
|
||||
Other,
|
||||
}
|
||||
|
||||
impl From<hyper::Method> for Method {
|
||||
fn from(value: hyper::Method) -> Self {
|
||||
if value == hyper::Method::GET {
|
||||
Method::Get
|
||||
} else if value == hyper::Method::PUT {
|
||||
Method::Put
|
||||
} else if value == hyper::Method::POST {
|
||||
Method::Post
|
||||
} else if value == hyper::Method::DELETE {
|
||||
Method::Delete
|
||||
} else {
|
||||
Method::Other
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
|
||||
|
||||
impl LabelValue for StatusCode {
|
||||
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
|
||||
v.write_int(self.0.as_u16() as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedCardinalityLabel for StatusCode {
|
||||
fn cardinality() -> usize {
|
||||
(100..1000).len()
|
||||
}
|
||||
|
||||
fn encode(&self) -> usize {
|
||||
self.0.as_u16() as usize
|
||||
}
|
||||
|
||||
fn decode(value: usize) -> Self {
|
||||
Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel)]
|
||||
pub(crate) enum DatabaseErrorLabel {
|
||||
Query,
|
||||
Connection,
|
||||
ConnectionPool,
|
||||
Logical,
|
||||
}
|
||||
|
||||
impl DatabaseError {
|
||||
pub(crate) fn error_label(&self) -> DatabaseErrorLabel {
|
||||
match self {
|
||||
Self::Query(_) => DatabaseErrorLabel::Query,
|
||||
Self::Connection(_) => DatabaseErrorLabel::Connection,
|
||||
Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
|
||||
Self::Logical(_) => DatabaseErrorLabel::Logical,
|
||||
}
|
||||
}
|
||||
Lazy::force(&RECONCILER);
|
||||
}
|
||||
|
||||
@@ -12,9 +12,7 @@ use serde::Serialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{backoff, id::NodeId};
|
||||
|
||||
use crate::{
|
||||
pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
|
||||
};
|
||||
use crate::{persistence::NodePersistence, scheduler::MaySchedule};
|
||||
|
||||
/// Represents the in-memory description of a Node.
|
||||
///
|
||||
@@ -204,7 +202,7 @@ impl Node {
|
||||
cancel: &CancellationToken,
|
||||
) -> Option<mgmt_api::Result<T>>
|
||||
where
|
||||
O: FnMut(PageserverClient) -> F,
|
||||
O: FnMut(mgmt_api::Client) -> F,
|
||||
F: std::future::Future<Output = mgmt_api::Result<T>>,
|
||||
{
|
||||
fn is_fatal(e: &mgmt_api::Error) -> bool {
|
||||
@@ -226,12 +224,8 @@ impl Node {
|
||||
.build()
|
||||
.expect("Failed to construct HTTP client");
|
||||
|
||||
let client = PageserverClient::from_client(
|
||||
self.get_id(),
|
||||
http_client,
|
||||
self.base_url(),
|
||||
jwt.as_deref(),
|
||||
);
|
||||
let client =
|
||||
mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
|
||||
|
||||
let node_cancel_fut = self.cancel.cancelled();
|
||||
|
||||
|
||||
@@ -1,203 +0,0 @@
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use pageserver_client::mgmt_api::{Client, Result};
|
||||
use reqwest::StatusCode;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
|
||||
/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
|
||||
/// controller to collect metrics in a non-intrusive manner.
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct PageserverClient {
|
||||
inner: Client,
|
||||
node_id_label: String,
|
||||
}
|
||||
|
||||
macro_rules! measured_request {
|
||||
($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{
|
||||
let labels = crate::metrics::PageserverRequestLabelGroup {
|
||||
pageserver_id: $node_id,
|
||||
path: $name,
|
||||
method: $method,
|
||||
};
|
||||
|
||||
let latency = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_pageserver_request_latency;
|
||||
let _timer_guard = latency.start_timer(labels.clone());
|
||||
|
||||
let res = $invoke;
|
||||
|
||||
if res.is_err() {
|
||||
let error_counters = &crate::metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_pageserver_request_error;
|
||||
error_counters.inc(labels)
|
||||
}
|
||||
|
||||
res
|
||||
}};
|
||||
}
|
||||
|
||||
impl PageserverClient {
|
||||
pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
|
||||
Self {
|
||||
inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt),
|
||||
node_id_label: node_id.0.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn from_client(
|
||||
node_id: NodeId,
|
||||
raw_client: reqwest::Client,
|
||||
mgmt_api_endpoint: String,
|
||||
jwt: Option<&str>,
|
||||
) -> Self {
|
||||
Self {
|
||||
inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt),
|
||||
node_id_label: node_id.0.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
|
||||
measured_request!(
|
||||
"tenant",
|
||||
crate::metrics::Method::Delete,
|
||||
&self.node_id_label,
|
||||
self.inner.tenant_delete(tenant_shard_id).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_time_travel_remote_storage(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timestamp: &str,
|
||||
done_if_after: &str,
|
||||
) -> Result<()> {
|
||||
measured_request!(
|
||||
"tenant_time_travel_remote_storage",
|
||||
crate::metrics::Method::Put,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after)
|
||||
.await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_secondary_download(
|
||||
&self,
|
||||
tenant_id: TenantShardId,
|
||||
wait: Option<std::time::Duration>,
|
||||
) -> Result<(StatusCode, SecondaryProgress)> {
|
||||
measured_request!(
|
||||
"tenant_secondary_download",
|
||||
crate::metrics::Method::Post,
|
||||
&self.node_id_label,
|
||||
self.inner.tenant_secondary_download(tenant_id, wait).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn location_config(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<std::time::Duration>,
|
||||
lazy: bool,
|
||||
) -> Result<()> {
|
||||
measured_request!(
|
||||
"location_config",
|
||||
crate::metrics::Method::Put,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.location_config(tenant_shard_id, config, flush_ms, lazy)
|
||||
.await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
|
||||
measured_request!(
|
||||
"location_configs",
|
||||
crate::metrics::Method::Get,
|
||||
&self.node_id_label,
|
||||
self.inner.list_location_config().await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn get_location_config(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<Option<LocationConfig>> {
|
||||
measured_request!(
|
||||
"location_config",
|
||||
crate::metrics::Method::Get,
|
||||
&self.node_id_label,
|
||||
self.inner.get_location_config(tenant_shard_id).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn timeline_create(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
req: &TimelineCreateRequest,
|
||||
) -> Result<TimelineInfo> {
|
||||
measured_request!(
|
||||
"timeline",
|
||||
crate::metrics::Method::Post,
|
||||
&self.node_id_label,
|
||||
self.inner.timeline_create(tenant_shard_id, req).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn timeline_delete(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<StatusCode> {
|
||||
measured_request!(
|
||||
"timeline",
|
||||
crate::metrics::Method::Delete,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.timeline_delete(tenant_shard_id, timeline_id)
|
||||
.await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_shard_split(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
req: TenantShardSplitRequest,
|
||||
) -> Result<TenantShardSplitResponse> {
|
||||
measured_request!(
|
||||
"tenant_shard_split",
|
||||
crate::metrics::Method::Put,
|
||||
&self.node_id_label,
|
||||
self.inner.tenant_shard_split(tenant_shard_id, req).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn timeline_list(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Result<Vec<TimelineInfo>> {
|
||||
measured_request!(
|
||||
"timelines",
|
||||
crate::metrics::Method::Get,
|
||||
&self.node_id_label,
|
||||
self.inner.timeline_list(tenant_shard_id).await
|
||||
)
|
||||
}
|
||||
|
||||
    pub(crate) async fn get_utilization(&self) -> Result<PageserverUtilization> {
        measured_request!(
            "utilization",
            crate::metrics::Method::Get,
            &self.node_id_label,
            self.inner.get_utilization().await
        )
    }
}
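Callers construct the wrapper instead of a bare mgmt_api client and otherwise use it identically; a hedged sketch of a call site (the surrounding variables are placeholders):

// Every call made through the wrapper is timed, and failures are counted, labelled by
// pageserver node id, request name and HTTP method via `measured_request!` above.
let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
let timelines = client.timeline_list(&tenant_shard_id).await?;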
|
||||
@@ -9,7 +9,6 @@ use camino::Utf8PathBuf;
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::controller_api::ShardSchedulingPolicy;
|
||||
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::ShardConfigError;
|
||||
@@ -20,9 +19,6 @@ use serde::{Deserialize, Serialize};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use crate::metrics::{
|
||||
DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
|
||||
};
|
||||
use crate::node::Node;
|
||||
|
||||
/// ## What do we store?
|
||||
@@ -79,25 +75,6 @@ pub(crate) enum DatabaseError {
|
||||
Logical(String),
|
||||
}
|
||||
|
||||
#[derive(measured::FixedCardinalityLabel, Clone)]
|
||||
pub(crate) enum DatabaseOperation {
|
||||
InsertNode,
|
||||
UpdateNode,
|
||||
DeleteNode,
|
||||
ListNodes,
|
||||
BeginShardSplit,
|
||||
CompleteShardSplit,
|
||||
AbortShardSplit,
|
||||
Detach,
|
||||
ReAttach,
|
||||
IncrementGeneration,
|
||||
ListTenantShards,
|
||||
InsertTenantShards,
|
||||
UpdateTenantShard,
|
||||
DeleteTenant,
|
||||
UpdateTenantConfig,
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub(crate) enum AbortShardSplitStatus {
|
||||
/// We aborted the split in the database by reverting to the parent shards
|
||||
@@ -108,12 +85,6 @@ pub(crate) enum AbortShardSplitStatus {
|
||||
|
||||
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
|
||||
|
||||
/// Some methods can operate on either a whole tenant or a single shard
|
||||
pub(crate) enum TenantFilter {
|
||||
Tenant(TenantId),
|
||||
Shard(TenantShardId),
|
||||
}
|
||||
|
||||
impl Persistence {
|
||||
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
|
||||
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
|
||||
@@ -144,38 +115,10 @@ impl Persistence {
|
||||
}
|
||||
}
|
||||
|
||||
    /// Wraps `with_conn` in order to collect latency and error metrics
    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
    where
        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
        let latency = &METRICS_REGISTRY
            .metrics_group
            .storage_controller_database_query_latency;
        let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
            operation: op.clone(),
        });

        let res = self.with_conn(func).await;

        if let Err(err) = &res {
            let error_counter = &METRICS_REGISTRY
                .metrics_group
                .storage_controller_database_query_error;
            error_counter.inc(DatabaseQueryErrorLabelGroup {
                error_type: err.error_label(),
                operation: op,
            })
        }

        res
    }
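The persistence methods below all follow this pattern; a condensed, hypothetical example (only the helper itself is new, every call inside it appears elsewhere in this file):

// Run a labelled query: its latency is recorded in
// storage_controller_database_query_latency and any error increments
// storage_controller_database_query_error, both under DatabaseOperation::ListNodes.
async fn count_nodes(persistence: &Persistence) -> DatabaseResult<usize> {
    persistence
        .with_measured_conn(DatabaseOperation::ListNodes, move |conn| {
            Ok(crate::schema::nodes::table
                .load::<NodePersistence>(conn)?
                .len())
        })
        .await
}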
|
||||
|
||||
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
||||
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
let mut conn = self.connection_pool.get()?;
|
||||
@@ -187,27 +130,21 @@ impl Persistence {
|
||||
/// When a node is first registered, persist it before using it for anything
|
||||
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
|
||||
let np = node.to_persistent();
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::InsertNode,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
diesel::insert_into(crate::schema::nodes::table)
|
||||
.values(&np)
|
||||
.execute(conn)?;
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::insert_into(crate::schema::nodes::table)
|
||||
.values(&np)
|
||||
.execute(conn)?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// At startup, populate the list of nodes which our shards may be placed on
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
|
||||
let nodes: Vec<NodePersistence> = self
|
||||
.with_measured_conn(
|
||||
DatabaseOperation::ListNodes,
|
||||
move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
||||
},
|
||||
)
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
||||
})
|
||||
.await?;
|
||||
|
||||
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
|
||||
@@ -222,7 +159,7 @@ impl Persistence {
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::nodes::dsl::*;
|
||||
let updated = self
|
||||
.with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
|
||||
.with_conn(move |conn| {
|
||||
let updated = diesel::update(nodes)
|
||||
.filter(node_id.eq(input_node_id.0 as i64))
|
||||
.set((scheduling_policy.eq(String::from(input_scheduling)),))
|
||||
@@ -244,12 +181,9 @@ impl Persistence {
|
||||
/// be enriched at runtime with state discovered on pageservers.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
let loaded = self
|
||||
.with_measured_conn(
|
||||
DatabaseOperation::ListTenantShards,
|
||||
move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
||||
},
|
||||
)
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
||||
})
|
||||
.await?;
|
||||
|
||||
if loaded.is_empty() {
|
||||
@@ -277,15 +211,15 @@ impl Persistence {
|
||||
|
||||
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
|
||||
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
|
||||
for shard in decoded.tenants.values_mut() {
|
||||
if shard.placement_policy == "\"Single\"" {
|
||||
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
|
||||
shard.placement_policy = "{\"Attached\":0}".to_string();
|
||||
}
|
||||
|
||||
if shard.scheduling_policy.is_empty() {
|
||||
shard.scheduling_policy =
|
||||
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
|
||||
for (tenant_id, tenant) in &mut decoded.tenants {
|
||||
// Backward compat: an old attachments.json from before PR #6251, replace
|
||||
// empty strings with proper defaults.
|
||||
if tenant.tenant_id.is_empty() {
|
||||
tenant.tenant_id = tenant_id.to_string();
|
||||
tenant.config = serde_json::to_string(&TenantConfig::default())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -331,20 +265,17 @@ impl Persistence {
|
||||
shards: Vec<TenantShardPersistence>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::InsertTenantShards,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
for tenant in &shards {
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(tenant)
|
||||
.execute(conn)?;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
for tenant in &shards {
|
||||
diesel::insert_into(tenant_shards)
|
||||
.values(tenant)
|
||||
.execute(conn)?;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -352,31 +283,25 @@ impl Persistence {
|
||||
/// the tenant from memory on this server.
|
||||
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::DeleteTenant,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
||||
.execute(conn)?;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
|
||||
use crate::schema::nodes::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::DeleteNode,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(nodes)
|
||||
.filter(node_id.eq(del_node_id.0 as i64))
|
||||
.execute(conn)?;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
diesel::delete(nodes)
|
||||
.filter(node_id.eq(del_node_id.0 as i64))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -390,7 +315,7 @@ impl Persistence {
|
||||
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
|
||||
.with_conn(move |conn| {
|
||||
let rows_updated = diesel::update(tenant_shards)
|
||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||
.set(generation.eq(generation + 1))
|
||||
@@ -440,7 +365,7 @@ impl Persistence {
|
||||
) -> anyhow::Result<Generation> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let updated = self
|
||||
.with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
|
||||
.with_conn(move |conn| {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
@@ -477,45 +402,59 @@ impl Persistence {
|
||||
/// that we only do the first time a tenant is set to an attached policy via /location_config.
|
||||
pub(crate) async fn update_tenant_shard(
|
||||
&self,
|
||||
tenant: TenantFilter,
|
||||
input_placement_policy: Option<PlacementPolicy>,
|
||||
input_config: Option<TenantConfig>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
input_placement_policy: PlacementPolicy,
|
||||
input_config: TenantConfig,
|
||||
input_generation: Option<Generation>,
|
||||
input_scheduling_policy: Option<ShardSchedulingPolicy>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
|
||||
let query = match tenant {
|
||||
TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.into_boxed(),
|
||||
TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
||||
.into_boxed(),
|
||||
};
|
||||
self.with_conn(move |conn| {
|
||||
let query = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
|
||||
|
||||
#[derive(AsChangeset)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
struct ShardUpdate {
|
||||
generation: Option<i32>,
|
||||
placement_policy: Option<String>,
|
||||
config: Option<String>,
|
||||
scheduling_policy: Option<String>,
|
||||
if let Some(input_generation) = input_generation {
|
||||
// Update includes generation column
|
||||
query
|
||||
.set((
|
||||
generation.eq(Some(input_generation.into().unwrap() as i32)),
|
||||
placement_policy
|
||||
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
|
||||
config.eq(serde_json::to_string(&input_config).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
} else {
|
||||
// Update does not include generation column
|
||||
query
|
||||
.set((
|
||||
placement_policy
|
||||
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
|
||||
config.eq(serde_json::to_string(&input_config).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
}
|
||||
|
||||
let update = ShardUpdate {
|
||||
generation: input_generation.map(|g| g.into().unwrap() as i32),
|
||||
placement_policy: input_placement_policy
|
||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
||||
config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
|
||||
scheduling_policy: input_scheduling_policy
|
||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
||||
};
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
|
||||
query.set(update).execute(conn)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn update_tenant_config(
|
||||
&self,
|
||||
input_tenant_id: TenantId,
|
||||
input_config: TenantConfig,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_conn(move |conn| {
|
||||
diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
||||
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(())
|
||||
})
|
||||
@@ -526,7 +465,7 @@ impl Persistence {
|
||||
|
||||
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
|
||||
self.with_conn(move |conn| {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
@@ -556,7 +495,7 @@ impl Persistence {
|
||||
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> DatabaseResult<()> {
|
||||
// Mark parent shards as splitting
|
||||
|
||||
@@ -620,29 +559,26 @@ impl Persistence {
|
||||
old_shard_count: ShardCount,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::CompleteShardSplit,
|
||||
move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
// Drop parent shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
conn.transaction(|conn| -> QueryResult<()> {
|
||||
// Drop parent shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
// Clear sharding flag
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
debug_assert!(updated > 0);
|
||||
|
||||
Ok(())
|
||||
})?;
|
||||
// Clear sharding flag
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
debug_assert!(updated > 0);
|
||||
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -654,44 +590,40 @@ impl Persistence {
|
||||
new_shard_count: ShardCount,
|
||||
) -> DatabaseResult<AbortShardSplitStatus> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_measured_conn(
|
||||
DatabaseOperation::AbortShardSplit,
|
||||
move |conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
let aborted =
|
||||
conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
// Clear the splitting state on parent shards
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
// Clear the splitting state on parent shards
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
|
||||
// Parent shards are already gone: we cannot abort.
|
||||
if updated == 0 {
|
||||
return Ok(AbortShardSplitStatus::Complete);
|
||||
}
|
||||
// Parent shards are already gone: we cannot abort.
|
||||
if updated == 0 {
|
||||
return Ok(AbortShardSplitStatus::Complete);
|
||||
}
|
||||
|
||||
// Sanity check: if parent shards were present, their cardinality should
|
||||
// be less than the number of child shards.
|
||||
if updated >= new_shard_count.count() as usize {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Unexpected parent shard count {updated} while aborting split to \
|
||||
// Sanity check: if parent shards were present, their cardinality should
|
||||
// be less than the number of child shards.
|
||||
if updated >= new_shard_count.count() as usize {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Unexpected parent shard count {updated} while aborting split to \
|
||||
count {new_shard_count:?} on tenant {split_tenant_id}"
|
||||
)));
|
||||
}
|
||||
)));
|
||||
}
|
||||
|
||||
// Erase child shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
// Erase child shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(AbortShardSplitStatus::Aborted)
|
||||
})?;
|
||||
Ok(AbortShardSplitStatus::Aborted)
|
||||
})?;
|
||||
|
||||
Ok(aborted)
|
||||
},
|
||||
)
|
||||
Ok(aborted)
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
@@ -726,8 +658,6 @@ pub(crate) struct TenantShardPersistence {
|
||||
pub(crate) splitting: SplitState,
|
||||
#[serde(default)]
|
||||
pub(crate) config: String,
|
||||
#[serde(default)]
|
||||
pub(crate) scheduling_policy: String,
|
||||
}
|
||||
|
||||
impl TenantShardPersistence {
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use crate::pageserver_client::PageserverClient;
|
||||
use crate::persistence::Persistence;
|
||||
use crate::service;
|
||||
use hyper::StatusCode;
|
||||
@@ -118,15 +117,6 @@ impl Reconciler {
|
||||
flush_ms: Option<Duration>,
|
||||
lazy: bool,
|
||||
) -> Result<(), ReconcileError> {
|
||||
if !node.is_available() && config.mode == LocationConfigMode::Detached {
|
||||
// Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
|
||||
// will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
|
||||
// what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
|
||||
tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
|
||||
self.observed.locations.remove(&node.get_id());
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: None });
|
||||
@@ -159,16 +149,9 @@ impl Reconciler {
|
||||
};
|
||||
tracing::info!("location_config({node}) complete: {:?}", config);
|
||||
|
||||
match config.mode {
|
||||
LocationConfigMode::Detached => {
|
||||
self.observed.locations.remove(&node.get_id());
|
||||
}
|
||||
_ => {
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
|
||||
}
|
||||
}
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -260,11 +243,8 @@ impl Reconciler {
|
||||
tenant_shard_id: TenantShardId,
|
||||
node: &Node,
|
||||
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
||||
let client = PageserverClient::new(
|
||||
node.get_id(),
|
||||
node.base_url(),
|
||||
self.service_config.jwt_token.as_deref(),
|
||||
);
|
||||
let client =
|
||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
||||
|
||||
let timelines = client.timeline_list(&tenant_shard_id).await?;
|
||||
Ok(timelines
|
||||
@@ -487,7 +467,6 @@ impl Reconciler {
|
||||
while let Err(e) = self.compute_notify().await {
|
||||
match e {
|
||||
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
|
||||
NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
|
||||
_ => {
|
||||
tracing::warn!(
|
||||
"Live migration blocked by compute notification error, retrying: {e}"
|
||||
@@ -496,7 +475,7 @@ impl Reconciler {
|
||||
}
|
||||
}
|
||||
|
||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then
|
||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
|
||||
// this location will be deleted in the general case reconciliation that runs after this.
|
||||
let origin_secondary_conf = build_location_config(
|
||||
&self.shard,
|
||||
|
||||
@@ -58,70 +58,6 @@ pub(crate) struct Scheduler {
|
||||
nodes: HashMap<NodeId, SchedulerNode>,
|
||||
}
|
||||
|
||||
/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
|
||||
///
|
||||
/// For example, we may set an affinity score based on the number of shards from the same
|
||||
/// tenant already on a node, to implicitly prefer to balance out shards.
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub(crate) struct AffinityScore(pub(crate) usize);
|
||||
|
||||
impl AffinityScore {
|
||||
/// If we have no anti-affinity at all toward a node, this is its score. It means
|
||||
/// the scheduler has a free choice amongst nodes with this score, and may pick a node
|
||||
/// based on other information such as total utilization.
|
||||
pub(crate) const FREE: Self = Self(0);
|
||||
|
||||
pub(crate) fn inc(&mut self) {
|
||||
self.0 += 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Add for AffinityScore {
|
||||
type Output = Self;
|
||||
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
Self(self.0 + rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
// For carrying state between multiple calls to [`TenantState::schedule`], e.g. when calling
|
||||
// it for many shards in the same tenant.
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct ScheduleContext {
|
||||
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
|
||||
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
|
||||
|
||||
/// Specifically how many _attached_ locations are on each node
|
||||
pub(crate) attached_nodes: HashMap<NodeId, usize>,
|
||||
}
|
||||
|
||||
impl ScheduleContext {
    /// Input is a list of nodes we would like to avoid using again within this context. The more
    /// times a node is passed into this call, the less inclined we are to use it.
    pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
        for node_id in nodes {
            let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
            entry.inc()
        }
    }

    pub(crate) fn push_attached(&mut self, node_id: NodeId) {
        let entry = self.attached_nodes.entry(node_id).or_default();
        *entry += 1;
    }

    pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
        self.nodes
            .get(&node_id)
            .copied()
            .unwrap_or(AffinityScore::FREE)
    }

    pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
        self.attached_nodes.get(&node_id).copied().unwrap_or(0)
    }
}
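A short sketch of how the context combines with the hard-exclude list when scheduling several shards of one tenant (variable names are illustrative; the methods are the ones defined here and in `schedule_shard` below):

// Record where this tenant's shards already live, so later placements are softly
// steered away from those nodes; `hard_exclude` still forbids reusing a shard's own nodes.
let mut context = ScheduleContext::default();
for shard in &tenant_shards {
    context.avoid(&shard.intent.all_pageservers());
}
let node_id = scheduler.schedule_shard(&this_shard_locations, &context)?;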
|
||||
|
||||
impl Scheduler {
|
||||
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
|
||||
let mut scheduler_nodes = HashMap::new();
|
||||
@@ -288,47 +224,27 @@ impl Scheduler {
|
||||
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
||||
}
|
||||
|
||||
/// hard_exclude: it is forbidden to use nodes in this list, typically because they
|
||||
/// are already in use by this shard -- we use this to avoid picking the same node
|
||||
/// as both attached and secondary location. This is a hard constraint: if we cannot
|
||||
/// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
|
||||
///
|
||||
/// context: we prefer to avoid using nodes identified in the context, according
|
||||
/// to their anti-affinity score. We use this to prefer to avoid placing shards in
|
||||
/// the same tenant on the same node. This is a soft constraint: the context will never
|
||||
/// cause us to fail to schedule a shard.
|
||||
pub(crate) fn schedule_shard(
|
||||
&self,
|
||||
hard_exclude: &[NodeId],
|
||||
context: &ScheduleContext,
|
||||
) -> Result<NodeId, ScheduleError> {
|
||||
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
|
||||
if self.nodes.is_empty() {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
|
||||
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
|
||||
let mut tenant_counts: Vec<(NodeId, usize)> = self
|
||||
.nodes
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
|
||||
None
|
||||
} else {
|
||||
Some((
|
||||
*k,
|
||||
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
|
||||
v.shard_count,
|
||||
))
|
||||
Some((*k, v.shard_count))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort by, in order of precedence:
|
||||
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
|
||||
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
|
||||
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
|
||||
scores.sort_by_key(|i| (i.1, i.2, i.0));
|
||||
// Sort by tenant count. Nodes with the same tenant count are sorted by ID.
|
||||
tenant_counts.sort_by_key(|i| (i.1, i.0));
|
||||
|
||||
if scores.is_empty() {
|
||||
if tenant_counts.is_empty() {
|
||||
// After applying constraints, no pageservers were left. We log some detail about
|
||||
// the state of nodes to help understand why this happened. This is not logged as an error because
|
||||
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
|
||||
@@ -344,11 +260,10 @@ impl Scheduler {
|
||||
return Err(ScheduleError::ImpossibleConstraint);
|
||||
}
|
||||
|
||||
// Lowest score wins
|
||||
let node_id = scores.first().unwrap().0;
|
||||
let node_id = tenant_counts.first().unwrap().0;
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
|
||||
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
|
||||
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
// Note that we do not update shard count here to reflect the scheduling: that
|
||||
@@ -356,12 +271,6 @@ impl Scheduler {
|
||||
|
||||
Ok(node_id)
|
||||
}
|
||||
|
||||
/// Unit test access to internal state
|
||||
#[cfg(test)]
|
||||
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
|
||||
self.nodes.get(&node_id).unwrap().shard_count
|
||||
}
|
||||
}
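
For orientation, here is a minimal sketch (not taken from the diff itself) of how the context-aware path above is intended to be driven when placing several shards of one tenant. The free function and its `shards` parameter are hypothetical; the `Scheduler` and `ScheduleContext` calls follow the signatures shown in this hunk.

// Illustrative only: place each shard of a tenant, accumulating anti-affinity in the
// context so that later shards prefer nodes the earlier shards did not use.
fn place_tenant(
    scheduler: &Scheduler,
    shards: &[Vec<NodeId>], // hypothetical input: nodes already used by each shard
) -> Result<Vec<NodeId>, ScheduleError> {
    let mut context = ScheduleContext::default();
    let mut picks = Vec::new();
    for used in shards {
        // Hard-exclude nodes this shard already occupies; the context softly steers
        // the choice away from nodes picked for sibling shards.
        let node_id = scheduler.schedule_shard(used, &context)?;
        context.avoid(&[node_id]);
        context.push_attached(node_id);
        picks.push(node_id);
    }
    Ok(picks)
}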

#[cfg(test)]
@@ -407,17 +316,15 @@ mod tests {
        let mut t1_intent = IntentState::new();
        let mut t2_intent = IntentState::new();

        let context = ScheduleContext::default();

        let scheduled = scheduler.schedule_shard(&[], &context)?;
        let scheduled = scheduler.schedule_shard(&[])?;
        t1_intent.set_attached(&mut scheduler, Some(scheduled));
        let scheduled = scheduler.schedule_shard(&[], &context)?;
        let scheduled = scheduler.schedule_shard(&[])?;
        t2_intent.set_attached(&mut scheduler, Some(scheduled));

        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);

        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
        t1_intent.push_secondary(&mut scheduler, scheduled);

        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);

@@ -22,7 +22,6 @@ diesel::table! {
        placement_policy -> Varchar,
        splitting -> Int2,
        config -> Text,
        scheduling_policy -> Varchar,
    }
}

File diff suppressed because it is too large
@@ -4,12 +4,8 @@ use std::{
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
|
||||
persistence::TenantShardPersistence,
|
||||
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
|
||||
};
|
||||
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
|
||||
use crate::{metrics, persistence::TenantShardPersistence};
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
@@ -117,10 +113,6 @@ pub(crate) struct TenantState {
|
||||
/// sending it. This is the mechanism by which compute notifications are included in the scope
|
||||
/// of state that we publish externally in an eventually consistent way.
|
||||
pub(crate) pending_compute_notification: bool,
|
||||
|
||||
// Support/debug tool: if something is going wrong or flapping with scheduling, this may
|
||||
// be set to a non-active state to avoid making changes while the issue is fixed.
|
||||
scheduling_policy: ShardSchedulingPolicy,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, Serialize)]
|
||||
@@ -251,13 +243,8 @@ impl IntentState {
|
||||
|
||||
impl Drop for IntentState {
|
||||
fn drop(&mut self) {
|
||||
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
|
||||
// We do not check this while panicking, to avoid polluting unit test failures or
|
||||
// other assertions with this assertion's output. It's still wrong to leak these,
|
||||
// but if we already have a panic then we don't need to independently flag this case.
|
||||
if !(std::thread::panicking()) {
|
||||
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
||||
}
|
||||
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
|
||||
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -302,26 +289,6 @@ pub enum ReconcileWaitError {
|
||||
Failed(TenantShardId, String),
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) struct ReplaceSecondary {
|
||||
old_node_id: NodeId,
|
||||
new_node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) struct MigrateAttachment {
|
||||
old_attached_node_id: NodeId,
|
||||
new_attached_node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) enum ScheduleOptimization {
|
||||
// Replace one of our secondary locations with a different node
|
||||
ReplaceSecondary(ReplaceSecondary),
|
||||
// Migrate attachment to an existing secondary location
|
||||
MigrateAttachment(MigrateAttachment),
|
||||
}
|
||||
|
||||
impl ReconcilerWaiter {
|
||||
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
|
||||
tokio::select! {
|
||||
@@ -400,7 +367,6 @@ impl TenantState {
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
last_error: Arc::default(),
|
||||
pending_compute_notification: false,
|
||||
scheduling_policy: ShardSchedulingPolicy::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -456,7 +422,6 @@ impl TenantState {
|
||||
fn schedule_attached(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &ScheduleContext,
|
||||
) -> Result<(bool, NodeId), ScheduleError> {
|
||||
// No work to do if we already have an attached tenant
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
@@ -470,33 +435,14 @@ impl TenantState {
|
||||
Ok((true, promote_secondary))
|
||||
} else {
|
||||
// Pick a fresh node: either we had no secondaries or none were schedulable
|
||||
let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
|
||||
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
|
||||
tracing::debug!("Selected {} as attached", node_id);
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
Ok((true, node_id))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn schedule(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &mut ScheduleContext,
|
||||
) -> Result<(), ScheduleError> {
|
||||
let r = self.do_schedule(scheduler, context);
|
||||
|
||||
context.avoid(&self.intent.all_pageservers());
|
||||
if let Some(attached) = self.intent.get_attached() {
|
||||
context.push_attached(*attached);
|
||||
}
|
||||
|
||||
r
|
||||
}
|
||||
|
||||
pub(crate) fn do_schedule(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &ScheduleContext,
|
||||
) -> Result<(), ScheduleError> {
|
||||
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
|
||||
// TODO: before scheduling new nodes, check if any existing content in
|
||||
// self.intent refers to pageservers that are offline, and pick other
|
||||
// pageservers if so.
|
||||
@@ -504,16 +450,6 @@ impl TenantState {
|
||||
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
|
||||
// change their attach location.
|
||||
|
||||
match self.scheduling_policy {
|
||||
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
|
||||
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
|
||||
// Warn to make it obvious why other things aren't happening/working, if we skip scheduling
|
||||
tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
|
||||
"Scheduling is disabled by policy {:?}", self.scheduling_policy);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
||||
// more work on the same pageservers we're already using.
|
||||
let mut modified = false;
|
||||
@@ -521,7 +457,22 @@ impl TenantState {
|
||||
// Add/remove nodes to fulfil policy
|
||||
use PlacementPolicy::*;
|
||||
match self.policy {
|
||||
Attached(secondary_count) => {
|
||||
Single => {
|
||||
// Should have exactly one attached, and zero secondaries
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Double(secondary_count) => {
|
||||
let retain_secondaries = if self.intent.attached.is_none()
|
||||
&& scheduler.node_preferred(&self.intent.secondary).is_some()
|
||||
{
|
||||
@@ -540,13 +491,12 @@ impl TenantState {
|
||||
}
|
||||
|
||||
// Should have exactly one attached, and N secondaries
|
||||
let (modified_attached, attached_node_id) =
|
||||
self.schedule_attached(scheduler, context)?;
|
||||
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
let mut used_pageservers = vec![attached_node_id];
|
||||
while self.intent.secondary.len() < secondary_count {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
@@ -559,7 +509,7 @@ impl TenantState {
|
||||
modified = true;
|
||||
} else if self.intent.secondary.is_empty() {
|
||||
// Populate secondary by scheduling a fresh node
|
||||
let node_id = scheduler.schedule_shard(&[], context)?;
|
||||
let node_id = scheduler.schedule_shard(&[])?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
modified = true;
|
||||
}
|
||||
@@ -586,167 +536,6 @@ impl TenantState {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Optimize attachments: if a shard has a secondary location that is preferable to
|
||||
/// its primary location based on soft constraints, switch that secondary location
|
||||
/// to be attached.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn optimize_attachment(
|
||||
&self,
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
schedule_context: &ScheduleContext,
|
||||
) -> Option<ScheduleOptimization> {
|
||||
let attached = (*self.intent.get_attached())?;
|
||||
if self.intent.secondary.is_empty() {
|
||||
// We can only do useful work if we have both attached and secondary locations: this
|
||||
// function doesn't schedule new locations, only swaps between attached and secondaries.
|
||||
return None;
|
||||
}
|
||||
|
||||
let current_affinity_score = schedule_context.get_node_affinity(attached);
|
||||
let current_attachment_count = schedule_context.get_node_attachments(attached);
|
||||
|
||||
// Generate score for each node, dropping any un-schedulable nodes.
|
||||
let all_pageservers = self.intent.all_pageservers();
|
||||
let mut scores = all_pageservers
|
||||
.iter()
|
||||
.flat_map(|node_id| {
|
||||
if matches!(
|
||||
nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.may_schedule())
|
||||
.unwrap_or(MaySchedule::No),
|
||||
MaySchedule::No
|
||||
) {
|
||||
None
|
||||
} else {
|
||||
let affinity_score = schedule_context.get_node_affinity(*node_id);
|
||||
let attachment_count = schedule_context.get_node_attachments(*node_id);
|
||||
Some((*node_id, affinity_score, attachment_count))
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Sort precedence:
|
||||
// 1st - prefer nodes with the lowest total affinity score
|
||||
// 2nd - prefer nodes with the lowest number of attachments in this context
|
||||
// 3rd - if all else is equal, sort by node ID for determinism in tests.
|
||||
scores.sort_by_key(|i| (i.1, i.2, i.0));
|
||||
|
||||
if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
|
||||
scores.first()
|
||||
{
|
||||
if attached != *preferred_node {
|
||||
// The best alternative must be more than 1 better than us, otherwise we could end
|
||||
// up flapping back next time we're called (e.g. there's no point migrating from
|
||||
// a location with score 1 to a score zero, because on next location the situation
|
||||
// would be the same, but in reverse).
|
||||
if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
|
||||
|| current_attachment_count > *preferred_attachment_count + 1
|
||||
{
|
||||
tracing::info!(
|
||||
"Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
|
||||
self.intent.get_secondary()
|
||||
);
|
||||
return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id: attached,
|
||||
new_attached_node_id: *preferred_node,
|
||||
}));
|
||||
}
|
||||
} else {
|
||||
tracing::debug!(
|
||||
"Node {} is already preferred (score {:?})",
|
||||
preferred_node,
|
||||
preferred_affinity_score
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Fall-through: we didn't find an optimization
|
||||
None
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn optimize_secondary(
|
||||
&self,
|
||||
scheduler: &Scheduler,
|
||||
schedule_context: &ScheduleContext,
|
||||
) -> Option<ScheduleOptimization> {
|
||||
if self.intent.secondary.is_empty() {
|
||||
// We can only do useful work if we have both attached and secondary locations: this
|
||||
// function doesn't schedule new locations, only swaps between attached and secondaries.
|
||||
return None;
|
||||
}
|
||||
|
||||
for secondary in self.intent.get_secondary() {
|
||||
let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
|
||||
// We're already on a node unaffected by any affinity constraints,
|
||||
// so we won't change it.
|
||||
continue;
|
||||
};
|
||||
|
||||
// Let the scheduler suggest a node, where it would put us if we were scheduling afresh
|
||||
// This implicitly limits the choice to nodes that are available, and prefers nodes
|
||||
// with lower utilization.
|
||||
let Ok(candidate_node) =
|
||||
scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
|
||||
else {
|
||||
// A scheduling error means we have no possible candidate replacements
|
||||
continue;
|
||||
};
|
||||
|
||||
let candidate_affinity_score = schedule_context
|
||||
.nodes
|
||||
.get(&candidate_node)
|
||||
.unwrap_or(&AffinityScore::FREE);
|
||||
|
||||
// The best alternative must be more than 1 better than us, otherwise we could end
|
||||
// up flapping back next time we're called.
|
||||
if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
|
||||
// If some other node is available and has a lower score than this node, then
|
||||
// that other node is a good place to migrate to.
|
||||
tracing::info!(
|
||||
"Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
|
||||
self.intent.get_secondary()
|
||||
);
|
||||
return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
||||
old_node_id: *secondary,
|
||||
new_node_id: candidate_node,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
pub(crate) fn apply_optimization(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
optimization: ScheduleOptimization,
|
||||
) {
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_schedule_optimization
|
||||
.inc();
|
||||
|
||||
match optimization {
|
||||
ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id,
|
||||
new_attached_node_id,
|
||||
}) => {
|
||||
self.intent.demote_attached(old_attached_node_id);
|
||||
self.intent
|
||||
.promote_attached(scheduler, new_attached_node_id);
|
||||
}
|
||||
ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
||||
old_node_id,
|
||||
new_node_id,
|
||||
}) => {
|
||||
self.intent.remove_secondary(scheduler, old_node_id);
|
||||
self.intent.push_secondary(scheduler, new_node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Query whether the tenant's observed state for attached node matches its intent state, and if so,
|
||||
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
|
||||
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
|
||||
@@ -891,19 +680,6 @@ impl TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
// Pre-checks done: finally check whether we may actually do the work
|
||||
match self.scheduling_policy {
|
||||
ShardSchedulingPolicy::Active
|
||||
| ShardSchedulingPolicy::Essential
|
||||
| ShardSchedulingPolicy::Pause => {}
|
||||
ShardSchedulingPolicy::Stop => {
|
||||
// We only reach this point if there is work to do and we're going to skip
|
||||
// doing it: warn to make it obvious why this tenant isn't doing what it ought to.
|
||||
tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// Build list of nodes from which the reconciler should detach
|
||||
let mut detach = Vec::new();
|
||||
for node_id in self.observed.locations.keys() {
|
||||
@@ -957,10 +733,7 @@ impl TenantState {
|
||||
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_reconcile_spawn
|
||||
.inc();
|
||||
metrics::RECONCILER.spawned.inc();
|
||||
let result_tx = result_tx.clone();
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
@@ -978,12 +751,10 @@ impl TenantState {
|
||||
// TODO: wrap all remote API operations in cancellation check
|
||||
// as well.
|
||||
if reconciler.cancel.is_cancelled() {
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_reconcile_complete
|
||||
.inc(ReconcileCompleteLabelGroup {
|
||||
status: ReconcileOutcome::Cancel,
|
||||
});
|
||||
metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
|
||||
.inc();
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -998,18 +769,18 @@ impl TenantState {
|
||||
}
|
||||
|
||||
// Update result counter
|
||||
let outcome_label = match &result {
|
||||
Ok(_) => ReconcileOutcome::Success,
|
||||
Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
|
||||
Err(_) => ReconcileOutcome::Error,
|
||||
};
|
||||
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_reconcile_complete
|
||||
.inc(ReconcileCompleteLabelGroup {
|
||||
status: outcome_label,
|
||||
});
|
||||
match &result {
|
||||
Ok(_) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
|
||||
Err(ReconcileError::Cancel) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
|
||||
Err(_) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
|
||||
}
|
||||
.inc();
|
||||
|
||||
result_tx
|
||||
.send(ReconcileResult {
|
||||
@@ -1040,22 +811,6 @@ impl TenantState {
|
||||
})
|
||||
}
|
||||
|
||||
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
|
||||
/// if it is not already running
|
||||
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
|
||||
if self.reconciler.is_some() {
|
||||
Some(ReconcilerWaiter {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
seq_wait: self.waiter.clone(),
|
||||
error_seq_wait: self.error_waiter.clone(),
|
||||
error: self.last_error.clone(),
|
||||
seq: self.sequence,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Called when a ReconcileResult has been emitted and the service is updating
|
||||
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
|
||||
/// the handle to indicate there is no longer a reconciliation in progress.
|
||||
@@ -1081,40 +836,6 @@ impl TenantState {
|
||||
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
|
||||
}
|
||||
|
||||
pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
|
||||
self.scheduling_policy = p;
|
||||
}
|
||||
|
||||
pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
|
||||
&self.scheduling_policy
|
||||
}
|
||||
|
||||
pub(crate) fn from_persistent(
|
||||
tsp: TenantShardPersistence,
|
||||
intent: IntentState,
|
||||
) -> anyhow::Result<Self> {
|
||||
let tenant_shard_id = tsp.get_tenant_shard_id()?;
|
||||
let shard_identity = tsp.get_shard_identity()?;
|
||||
|
||||
Ok(Self {
|
||||
tenant_shard_id,
|
||||
shard: shard_identity,
|
||||
sequence: Sequence::initial(),
|
||||
generation: tsp.generation.map(|g| Generation::new(g as u32)),
|
||||
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
|
||||
intent,
|
||||
observed: ObservedState::new(),
|
||||
config: serde_json::from_str(&tsp.config).unwrap(),
|
||||
reconciler: None,
|
||||
splitting: tsp.splitting,
|
||||
waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
last_error: Arc::default(),
|
||||
pending_compute_notification: false,
|
||||
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
|
||||
TenantShardPersistence {
|
||||
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
|
||||
@@ -1126,7 +847,6 @@ impl TenantState {
|
||||
placement_policy: serde_json::to_string(&self.policy).unwrap(),
|
||||
config: serde_json::to_string(&self.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1165,32 +885,6 @@ pub(crate) mod tests {
|
||||
)
|
||||
}
|
||||
|
||||
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantState> {
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
(0..shard_count.count())
|
||||
.map(|i| {
|
||||
let shard_number = ShardNumber(i);
|
||||
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
};
|
||||
TenantState::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
policy.clone(),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Test the scheduling behaviors used when a tenant configured for HA is subject
|
||||
/// to nodes being marked offline.
|
||||
#[test]
|
||||
@@ -1200,11 +894,10 @@ pub(crate) mod tests {
|
||||
let mut nodes = make_test_nodes(3);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut context = ScheduleContext::default();
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
||||
tenant_state
|
||||
.schedule(&mut scheduler, &mut context)
|
||||
.schedule(&mut scheduler)
|
||||
.expect("we have enough nodes, scheduling should work");
|
||||
|
||||
// Expect to initially be scheduled on to different nodes
|
||||
@@ -1230,7 +923,7 @@ pub(crate) mod tests {
|
||||
|
||||
// Scheduling the node should promote the still-available secondary node to attached
|
||||
tenant_state
|
||||
.schedule(&mut scheduler, &mut context)
|
||||
.schedule(&mut scheduler)
|
||||
.expect("active nodes are available");
|
||||
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
|
||||
|
||||
@@ -1250,7 +943,7 @@ pub(crate) mod tests {
|
||||
let nodes = make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
||||
|
||||
tenant_state.observed.locations.insert(
|
||||
NodeId(3),
|
||||
@@ -1294,219 +987,4 @@ pub(crate) mod tests {
|
||||
tenant_state.intent.clear(&mut scheduler);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scheduling_mode() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
// In pause mode, schedule() shouldn't do anything
|
||||
tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause;
|
||||
assert!(tenant_state
|
||||
.schedule(&mut scheduler, &mut ScheduleContext::default())
|
||||
.is_ok());
|
||||
assert!(tenant_state.intent.all_pageservers().is_empty());
|
||||
|
||||
// In active mode, schedule() works
|
||||
tenant_state.scheduling_policy = ShardSchedulingPolicy::Active;
|
||||
assert!(tenant_state
|
||||
.schedule(&mut scheduler, &mut ScheduleContext::default())
|
||||
.is_ok());
|
||||
assert!(!tenant_state.intent.all_pageservers().is_empty());
|
||||
|
||||
tenant_state.intent.clear(&mut scheduler);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn optimize_attachment() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
// Initially: both nodes attached on shard 1, and both have secondary locations
|
||||
// on different nodes.
|
||||
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
||||
shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
|
||||
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
||||
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
schedule_context.avoid(&shard_a.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
|
||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
||||
|
||||
let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
|
||||
|
||||
// Either shard should recognize that it has the option to switch to a secondary location where there
|
||||
// would be no other shards from the same tenant, and request to do so.
|
||||
assert_eq!(
|
||||
optimization_a,
|
||||
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id: NodeId(1),
|
||||
new_attached_node_id: NodeId(2)
|
||||
}))
|
||||
);
|
||||
|
||||
// Note that optimizing two shards in the same tenant with the same ScheduleContext is
|
||||
// mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
|
||||
// of [`Service::optimize_all`] to avoid trying
|
||||
// to do optimizations for multiple shards in the same tenant at the same time. Generating
|
||||
// both optimizations is just done for test purposes
|
||||
let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
|
||||
assert_eq!(
|
||||
optimization_b,
|
||||
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id: NodeId(1),
|
||||
new_attached_node_id: NodeId(3)
|
||||
}))
|
||||
);
|
||||
|
||||
// Applying these optimizations should result in the end state proposed
|
||||
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
|
||||
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
|
||||
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
|
||||
shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
|
||||
assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
|
||||
assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
|
||||
|
||||
shard_a.intent.clear(&mut scheduler);
|
||||
shard_b.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn optimize_secondary() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(4);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
// Initially: both nodes attached on shard 1, and both have secondary locations
|
||||
// on different nodes.
|
||||
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
||||
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
|
||||
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
schedule_context.avoid(&shard_a.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
|
||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
||||
|
||||
let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
|
||||
|
||||
// Since there is a node with no locations available, the node with two locations for the
|
||||
// same tenant should generate an optimization to move one away
|
||||
assert_eq!(
|
||||
optimization_a,
|
||||
Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
||||
old_node_id: NodeId(3),
|
||||
new_node_id: NodeId(4)
|
||||
}))
|
||||
);
|
||||
|
||||
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
|
||||
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
|
||||
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
|
||||
|
||||
shard_a.intent.clear(&mut scheduler);
|
||||
shard_b.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Optimize til quiescent: this emulates what Service::optimize_all does, when
|
||||
// called repeatedly in the background.
|
||||
fn optimize_til_idle(
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
scheduler: &mut Scheduler,
|
||||
shards: &mut [TenantState],
|
||||
) {
|
||||
let mut loop_n = 0;
|
||||
loop {
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
let mut any_changed = false;
|
||||
|
||||
for shard in shards.iter() {
|
||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
||||
if let Some(attached) = shard.intent.get_attached() {
|
||||
schedule_context.push_attached(*attached);
|
||||
}
|
||||
}
|
||||
|
||||
for shard in shards.iter_mut() {
|
||||
let optimization = shard.optimize_attachment(nodes, &schedule_context);
|
||||
if let Some(optimization) = optimization {
|
||||
shard.apply_optimization(scheduler, optimization);
|
||||
any_changed = true;
|
||||
break;
|
||||
}
|
||||
|
||||
let optimization = shard.optimize_secondary(scheduler, &schedule_context);
|
||||
if let Some(optimization) = optimization {
|
||||
shard.apply_optimization(scheduler, optimization);
|
||||
any_changed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !any_changed {
|
||||
break;
|
||||
}
|
||||
|
||||
// Assert no infinite loop
|
||||
loop_n += 1;
|
||||
assert!(loop_n < 1000);
|
||||
}
|
||||
}
|
||||
|
||||
/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
|
||||
/// that it converges.
|
||||
#[test]
|
||||
fn optimize_add_nodes() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(4);
|
||||
|
||||
// Only show the scheduler a couple of nodes
|
||||
let mut scheduler = Scheduler::new([].iter());
|
||||
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
|
||||
scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
|
||||
|
||||
let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
for shard in &mut shards {
|
||||
assert!(shard
|
||||
.schedule(&mut scheduler, &mut schedule_context)
|
||||
.is_ok());
|
||||
}
|
||||
|
||||
// We should see equal number of locations on the two nodes.
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
|
||||
|
||||
// Add another two nodes: we should see the shards spread out when their optimize
|
||||
// methods are called
|
||||
scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
|
||||
scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
|
||||
optimize_til_idle(&nodes, &mut scheduler, &mut shards);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
|
||||
|
||||
for shard in shards.iter_mut() {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -294,7 +294,7 @@ where
|
||||
// is in state 'taken' but the thread that would unlock it is
|
||||
// not there.
|
||||
// 2. A rust object that represented some external resource in the
|
||||
// parent now got implicitly copied by the fork, even though
|
||||
// parent now got implicitly copied by the the fork, even though
|
||||
// the object's type is not `Copy`. The parent program may use
|
||||
// non-copyability as way to enforce unique ownership of an
|
||||
// external resource in the typesystem. The fork breaks that
|
||||
|
||||
@@ -437,7 +437,7 @@ async fn handle_tenant(
|
||||
|
||||
let placement_policy = match create_match.get_one::<String>("placement-policy") {
|
||||
Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
|
||||
_ => PlacementPolicy::Attached(0),
|
||||
_ => PlacementPolicy::Single,
|
||||
};
|
||||
|
||||
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
|
||||
@@ -523,6 +523,88 @@ async fn handle_tenant(
|
||||
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
||||
println!("tenant {tenant_id} successfully configured on the pageserver");
|
||||
}
|
||||
Some(("migrate", matches)) => {
|
||||
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
|
||||
let new_pageserver = get_pageserver(env, matches)?;
|
||||
let new_pageserver_id = new_pageserver.conf.id;
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.tenant_migrate(tenant_shard_id, new_pageserver_id)
|
||||
.await?;
|
||||
|
||||
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
|
||||
}
|
||||
Some(("status", matches)) => {
|
||||
let tenant_id = get_tenant_id(matches, env)?;
|
||||
|
||||
let mut shard_table = comfy_table::Table::new();
|
||||
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
|
||||
|
||||
let mut tenant_synthetic_size = None;
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
|
||||
let pageserver =
|
||||
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
|
||||
|
||||
let size = pageserver
|
||||
.http_client
|
||||
.tenant_details(shard.shard_id)
|
||||
.await?
|
||||
.tenant_info
|
||||
.current_physical_size
|
||||
.unwrap();
|
||||
|
||||
shard_table.add_row([
|
||||
format!("{}", shard.shard_id.shard_slug()),
|
||||
format!("{}", shard.node_id.0),
|
||||
format!("{} MiB", size / (1024 * 1024)),
|
||||
]);
|
||||
|
||||
if shard.shard_id.is_zero() {
|
||||
tenant_synthetic_size =
|
||||
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
|
||||
}
|
||||
}
|
||||
|
||||
let Some(synthetic_size) = tenant_synthetic_size else {
|
||||
bail!("Shard 0 not found")
|
||||
};
|
||||
|
||||
let mut tenant_table = comfy_table::Table::new();
|
||||
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
|
||||
tenant_table.add_row([
|
||||
"Synthetic size".to_string(),
|
||||
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
|
||||
]);
|
||||
|
||||
println!("{tenant_table}");
|
||||
println!("{shard_table}");
|
||||
}
|
||||
Some(("shard-split", matches)) => {
|
||||
let tenant_id = get_tenant_id(matches, env)?;
|
||||
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
|
||||
let shard_stripe_size: Option<ShardStripeSize> = matches
|
||||
.get_one::<Option<ShardStripeSize>>("shard-stripe-size")
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let result = storage_controller
|
||||
.tenant_split(tenant_id, shard_count, shard_stripe_size)
|
||||
.await?;
|
||||
println!(
|
||||
"Split tenant {} into shards {}",
|
||||
tenant_id,
|
||||
result
|
||||
.new_shards
|
||||
.iter()
|
||||
.map(|s| format!("{:?}", s))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
}
|
||||
|
||||
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
||||
None => bail!("no tenant subcommand provided"),
|
||||
@@ -1496,6 +1578,19 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("config")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
||||
.subcommand(Command::new("migrate")
|
||||
.about("Migrate a tenant from one pageserver to another")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(pageserver_id_arg.clone()))
|
||||
.subcommand(Command::new("status")
|
||||
.about("Human readable summary of the tenant's shards and attachment locations")
|
||||
.arg(tenant_id_arg.clone()))
|
||||
.subcommand(Command::new("shard-split")
|
||||
.about("Increase the number of shards in the tenant")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
||||
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("pageserver")
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
//!
|
||||
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
|
||||
//! started, we launch `compute_ctl`. It synchronizes the safekeepers, downloads
|
||||
//! the basebackup from the pageserver to initialize the data directory, and
|
||||
//! the basebackup from the pageserver to initialize the the data directory, and
|
||||
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
|
||||
//! until it exits.
|
||||
//!
|
||||
|
||||
@@ -127,8 +127,8 @@ pub struct PageServerConf {
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
|
||||
pub(crate) virtual_file_io_engine: Option<String>,
|
||||
pub(crate) get_vectored_impl: Option<String>,
|
||||
pub(crate) virtual_file_io_engine: String,
|
||||
pub(crate) get_vectored_impl: String,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -139,8 +139,9 @@ impl Default for PageServerConf {
|
||||
listen_http_addr: String::new(),
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
virtual_file_io_engine: None,
|
||||
get_vectored_impl: None,
|
||||
// FIXME: use the ones exposed by pageserver crate
|
||||
virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
|
||||
get_vectored_impl: "sequential".to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,16 +101,8 @@ impl PageServerNode {
|
||||
|
||||
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
|
||||
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
|
||||
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
|
||||
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
|
||||
format!("get_vectored_impl='{get_vectored_impl}'")
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'");
|
||||
let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'");
|
||||
|
||||
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
||||
|
||||
@@ -389,10 +381,6 @@ impl PageServerNode {
|
||||
.remove("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
.transpose()?,
|
||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||
walreceiver_connect_timeout: settings
|
||||
.remove("walreceiver_connect_timeout")
|
||||
@@ -505,12 +493,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
|
||||
|
||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||
walreceiver_connect_timeout: settings
|
||||
.remove("walreceiver_connect_timeout")
|
||||
|
||||
@@ -279,7 +279,6 @@ impl StorageController {
|
||||
&self.listen,
|
||||
"-p",
|
||||
self.path.as_ref(),
|
||||
"--dev",
|
||||
"--database-url",
|
||||
&database_url,
|
||||
"--max-unavailable-interval",
|
||||
@@ -476,7 +475,7 @@ impl StorageController {
|
||||
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
||||
self.dispatch::<(), _>(
|
||||
Method::GET,
|
||||
format!("debug/v1/tenant/{tenant_id}/locate"),
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -40,7 +40,7 @@ macro_rules! register_hll {
|
||||
}};
|
||||
|
||||
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
|
||||
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
|
||||
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
|
||||
}};
|
||||
}
|
||||
|
||||
|
||||
@@ -6,10 +6,7 @@ use std::str::FromStr;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{
|
||||
models::{ShardParameters, TenantConfig},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use crate::{models::ShardParameters, shard::TenantShardId};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
@@ -42,12 +39,6 @@ pub struct NodeConfigureRequest {
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantPolicyRequest {
|
||||
pub placement: Option<PlacementPolicy>,
|
||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantLocateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
@@ -66,31 +57,6 @@ pub struct TenantLocateResponse {
|
||||
pub shard_params: ShardParameters,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantDescribeResponse {
|
||||
pub shards: Vec<TenantDescribeResponseShard>,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
pub policy: PlacementPolicy,
|
||||
pub config: TenantConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantDescribeResponseShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
|
||||
pub node_attached: Option<NodeId>,
|
||||
pub node_secondary: Vec<NodeId>,
|
||||
|
||||
pub last_error: String,
|
||||
|
||||
/// A task is currently running to reconcile this tenant's intent state with the state on pageservers
|
||||
pub is_reconciling: bool,
|
||||
/// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
|
||||
pub is_pending_compute_notification: bool,
|
||||
/// A shard split is currently underway
|
||||
pub is_splitting: bool,
|
||||
}
|
||||
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
@@ -176,32 +142,6 @@ impl FromStr for NodeAvailability {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum ShardSchedulingPolicy {
|
||||
// Normal mode: the tenant's scheduled locations may be updated at will, including
|
||||
// for non-essential optimization.
|
||||
Active,
|
||||
|
||||
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
|
||||
// For example, this still permits a node's attachment location to change to a secondary in
|
||||
// response to a node failure, or to assign a new secondary if a node was removed.
|
||||
Essential,
|
||||
|
||||
// No scheduling: leave the shard running wherever it currently is. Even if the shard is
|
||||
// unavailable, it will not be rescheduled to another node.
|
||||
Pause,
|
||||
|
||||
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
|
||||
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
|
||||
Stop,
|
||||
}
|
||||
|
||||
impl Default for ShardSchedulingPolicy {
|
||||
fn default() -> Self {
|
||||
Self::Active
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
@@ -241,8 +181,11 @@ impl From<NodeSchedulingPolicy> for String {
|
||||
/// to create secondary locations.
|
||||
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||
pub enum PlacementPolicy {
|
||||
/// Normal live state: one attached pageserver and zero or more secondaries.
|
||||
Attached(usize),
|
||||
/// Cheapest way to attach a tenant: just one pageserver, no secondary
|
||||
Single,
|
||||
/// Production-ready way to attach a tenant: one attached pageserver and
|
||||
/// some number of secondaries.
|
||||
Double(usize),
|
||||
/// Create one secondary mode location. This is useful when onboarding
|
||||
/// a tenant, or for an idle tenant that we might want to bring online quickly.
|
||||
Secondary,
|
||||
@@ -264,14 +207,14 @@ mod test {
|
||||
/// Check stability of PlacementPolicy's serialization
|
||||
#[test]
|
||||
fn placement_policy_encoding() -> anyhow::Result<()> {
|
||||
let v = PlacementPolicy::Attached(1);
|
||||
let v = PlacementPolicy::Double(1);
|
||||
let encoded = serde_json::to_string(&v)?;
|
||||
assert_eq!(encoded, "{\"Attached\":1}");
|
||||
assert_eq!(encoded, "{\"Double\":1}");
|
||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||
|
||||
let v = PlacementPolicy::Detached;
|
||||
let v = PlacementPolicy::Single;
|
||||
let encoded = serde_json::to_string(&v)?;
|
||||
assert_eq!(encoded, "\"Detached\"");
|
||||
assert_eq!(encoded, "\"Single\"");
|
||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -301,7 +301,6 @@ pub struct TenantConfig {
|
||||
pub heatmap_period: Option<String>,
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
|
||||
@@ -6,9 +6,7 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{
|
||||
controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
|
||||
};
|
||||
use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
|
||||
|
||||
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
||||
/// startup.
|
||||
@@ -22,20 +20,12 @@ pub struct ReAttachRequest {
|
||||
pub register: Option<NodeRegisterRequest>,
|
||||
}
|
||||
|
||||
fn default_mode() -> LocationConfigMode {
|
||||
LocationConfigMode::AttachedSingle
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponseTenant {
|
||||
pub id: TenantShardId,
|
||||
/// Mandatory if LocationConfigMode is None or set to an Attached* mode
|
||||
pub gen: Option<u32>,
|
||||
|
||||
/// Default value only for backward compat: this field should be set
|
||||
#[serde(default = "default_mode")]
|
||||
pub mode: LocationConfigMode,
|
||||
pub gen: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponse {
|
||||
pub tenants: Vec<ReAttachResponseTenant>,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use anyhow::*;
|
||||
use clap::{value_parser, Arg, ArgMatches, Command};
|
||||
use postgres::Client;
|
||||
use std::{path::PathBuf, str::FromStr};
|
||||
use wal_craft::*;
|
||||
|
||||
@@ -9,8 +8,8 @@ fn main() -> Result<()> {
|
||||
.init();
|
||||
let arg_matches = cli().get_matches();
|
||||
|
||||
let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
|
||||
let intermediate_lsns = match arg_matches
|
||||
let wal_craft = |arg_matches: &ArgMatches, client| {
|
||||
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
|
||||
.get_one::<String>("type")
|
||||
.map(|s| s.as_str())
|
||||
.context("'type' is required")?
|
||||
@@ -26,7 +25,6 @@ fn main() -> Result<()> {
|
||||
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
|
||||
a => panic!("Unknown --type argument: {a}"),
|
||||
};
|
||||
let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
for lsn in intermediate_lsns {
|
||||
println!("intermediate_lsn = {lsn}");
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ use postgres::types::PgLsn;
|
||||
use postgres::Client;
|
||||
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
||||
use std::cmp::Ordering;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -231,52 +232,59 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow
|
||||
pub trait Crafter {
|
||||
const NAME: &'static str;
|
||||
|
||||
/// Generates WAL using the client `client`. Returns a vector of some valid
|
||||
/// "interesting" intermediate LSNs which one may start reading from.
|
||||
/// test_end_of_wal uses this to check various starting points.
|
||||
///
|
||||
/// Note that postgres is generally keen about writing some WAL. While we
|
||||
/// try to disable it (autovacuum, big wal_writer_delay, etc) it is always
|
||||
/// possible, e.g. xl_running_xacts are dumped each 15s. So checks about
|
||||
/// stable WAL end would be flaky unless postgres is shut down. For this
|
||||
/// reason returning potential end of WAL here is pointless. Most of the
|
||||
/// time this doesn't happen though, so it is reasonable to create needed
|
||||
/// WAL structure and immediately kill postgres like test_end_of_wal does.
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>>;
|
||||
/// Generates WAL using the client `client`. Returns a pair of:
|
||||
/// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
|
||||
/// May include or exclude Lsn(0) and the end-of-wal.
|
||||
/// * The expected end-of-wal LSN.
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
|
||||
}
|
||||
|
||||
/// Wraps some WAL craft function, providing current LSN to it before the
|
||||
/// insertion and flushing WAL afterwards. Also pushes initial LSN to the
|
||||
/// result.
|
||||
fn craft_internal<C: postgres::GenericClient>(
|
||||
client: &mut C,
|
||||
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<Vec<PgLsn>>,
|
||||
) -> anyhow::Result<Vec<PgLsn>> {
|
||||
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
|
||||
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
ensure_server_config(client)?;
|
||||
|
||||
let initial_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
info!("LSN initial = {}", initial_lsn);
|
||||
|
||||
let mut intermediate_lsns = f(client, initial_lsn)?;
|
||||
let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
|
||||
let last_lsn = match last_lsn {
|
||||
None => client.pg_current_wal_insert_lsn()?,
|
||||
Some(last_lsn) => {
|
||||
let insert_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
match last_lsn.cmp(&insert_lsn) {
|
||||
Ordering::Less => bail!(
|
||||
"Some records were inserted after the crafted WAL: {} vs {}",
|
||||
last_lsn,
|
||||
insert_lsn
|
||||
),
|
||||
Ordering::Equal => last_lsn,
|
||||
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
|
||||
}
|
||||
}
|
||||
};
|
||||
if !intermediate_lsns.starts_with(&[initial_lsn]) {
|
||||
intermediate_lsns.insert(0, initial_lsn);
|
||||
}
|
||||
|
||||
// Some records may be not flushed, e.g. non-transactional logical messages.
|
||||
//
|
||||
// Note: this is broken if pg_current_wal_insert_lsn is at page boundary
|
||||
// because pg_current_wal_insert_lsn skips page headers.
|
||||
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
||||
Ok(intermediate_lsns)
|
||||
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
|
||||
Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
|
||||
Ordering::Equal => {}
|
||||
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
|
||||
}
|
||||
Ok((intermediate_lsns, last_lsn))
|
||||
}
|
||||
|
||||
pub struct Simple;
|
||||
impl Crafter for Simple {
|
||||
const NAME: &'static str = "simple";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
craft_internal(client, |client, _| {
|
||||
client.execute("CREATE table t(x int)", &[])?;
|
||||
Ok(Vec::new())
|
||||
Ok((Vec::new(), None))
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -284,36 +292,29 @@ impl Crafter for Simple {
|
||||
pub struct LastWalRecordXlogSwitch;
|
||||
impl Crafter for LastWalRecordXlogSwitch {
|
||||
const NAME: &'static str = "last_wal_record_xlog_switch";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||
// Do not use craft_internal because here we end up with flush_lsn exactly on
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
||||
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
||||
ensure_server_config(client)?;
|
||||
|
||||
client.execute("CREATE table t(x int)", &[])?;
|
||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||
// pg_switch_wal returns end of last record of the switched segment,
|
||||
// i.e. end of SWITCH itself.
|
||||
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let before_xlog_switch_u64 = u64::from(before_xlog_switch);
|
||||
let next_segment = PgLsn::from(
|
||||
before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64)
|
||||
+ WAL_SEGMENT_SIZE as u64,
|
||||
);
|
||||
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let next_segment = PgLsn::from(0x0200_0000);
|
||||
ensure!(
|
||||
xlog_switch_record_end <= next_segment,
|
||||
"XLOG_SWITCH record ended after the expected segment boundary: {} > {}",
|
||||
xlog_switch_record_end,
|
||||
after_xlog_switch <= next_segment,
|
||||
"XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
|
||||
after_xlog_switch,
|
||||
next_segment
|
||||
);
|
||||
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
||||
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
|
||||
}
|
||||
}
|
||||
|
||||
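// Illustrative sketch (not part of the diff): the segment-boundary arithmetic used by
// LastWalRecordXlogSwitch::craft above, rounding an LSN up to the start of the next WAL
// segment. WAL_SEGMENT_SIZE is assumed here to be the usual 16 MiB.
#[cfg(test)]
mod next_segment_sketch {
    const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;

    /// Round `lsn` up to the first byte of the next WAL segment.
    fn next_segment_start(lsn: u64) -> u64 {
        lsn - (lsn % WAL_SEGMENT_SIZE) + WAL_SEGMENT_SIZE
    }

    #[test]
    fn rounds_up_to_next_segment() {
        // 0x0123_4567 sits inside the segment that starts at 0x0100_0000,
        // so the next segment begins at 0x0200_0000.
        assert_eq!(next_segment_start(0x0123_4567), 0x0200_0000);
        // An LSN exactly on a segment boundary still advances to the following segment.
        assert_eq!(next_segment_start(0x0200_0000), 0x0300_0000);
    }
}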
pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
|
||||
/// Craft xlog SWITCH record ending at page boundary.
|
||||
impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
||||
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
||||
ensure_server_config(client)?;
|
||||
@@ -360,29 +361,28 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
|
||||
// Emit the XLOG_SWITCH
|
||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let next_segment = PgLsn::from(0x0200_0000);
|
||||
ensure!(
|
||||
xlog_switch_record_end < next_segment,
|
||||
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
|
||||
xlog_switch_record_end,
|
||||
after_xlog_switch < next_segment,
|
||||
"XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
|
||||
after_xlog_switch,
|
||||
next_segment
|
||||
);
|
||||
ensure!(
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
|
||||
xlog_switch_record_end,
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
after_xlog_switch,
|
||||
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
|
||||
);
|
||||
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
||||
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
|
||||
}
|
||||
}
|
||||
|
||||
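// Illustrative sketch (constants are assumptions, not taken from the diff): the check above
// treats "the XLOG_SWITCH record ends exactly on a page boundary" as "the reported end LSN
// sits right after the short page header of the following page", because LSNs reported by
// postgres skip over page headers.
#[cfg(test)]
mod page_boundary_sketch {
    const XLOG_BLCKSZ: u64 = 8192;
    const XLOG_SIZE_OF_XLOG_SHORT_PHD: u64 = 24;

    fn ends_on_page_boundary(lsn: u64) -> bool {
        lsn % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD
    }

    #[test]
    fn detects_page_boundary() {
        // Three full pages plus a short page header: the previous record ended on the boundary.
        assert!(ends_on_page_boundary(3 * XLOG_BLCKSZ + XLOG_SIZE_OF_XLOG_SHORT_PHD));
        // An end position in the middle of a page is not a boundary.
        assert!(!ends_on_page_boundary(3 * XLOG_BLCKSZ + 100));
    }
}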
/// Write ~16MB logical message; it should cross WAL segment.
|
||||
fn craft_seg_size_logical_message(
|
||||
fn craft_single_logical_message(
|
||||
client: &mut impl postgres::GenericClient,
|
||||
transactional: bool,
|
||||
) -> anyhow::Result<Vec<PgLsn>> {
|
||||
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
craft_internal(client, |client, initial_lsn| {
|
||||
ensure!(
|
||||
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
|
||||
@@ -405,24 +405,34 @@ fn craft_seg_size_logical_message(
|
||||
"Logical message crossed two segments"
|
||||
);
|
||||
|
||||
Ok(vec![message_lsn])
|
||||
if transactional {
|
||||
// Transactional logical messages are part of a transaction, so the one above is
|
||||
// followed by a small COMMIT record.
|
||||
|
||||
let after_message_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
ensure!(
|
||||
message_lsn < after_message_lsn,
|
||||
"No record found after the emitted message"
|
||||
);
|
||||
Ok((vec![message_lsn], Some(after_message_lsn)))
|
||||
} else {
|
||||
Ok((Vec::new(), Some(message_lsn)))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub struct WalRecordCrossingSegmentFollowedBySmallOne;
|
||||
impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
|
||||
const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||
// Transactional message crossing WAL segment will be followed by small
|
||||
// commit record.
|
||||
craft_seg_size_logical_message(client, true)
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
craft_single_logical_message(client, true)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LastWalRecordCrossingSegment;
|
||||
impl Crafter for LastWalRecordCrossingSegment {
|
||||
const NAME: &'static str = "last_wal_record_crossing_segment";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||
craft_seg_size_logical_message(client, false)
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
craft_single_logical_message(client, false)
|
||||
}
|
||||
}
|
||||
|
||||
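// Illustrative sketch, loosely modeled on craft_single_logical_message above (not part of
// the diff): a single pg_logical_emit_message() call slightly larger than one 16 MiB WAL
// segment is enough to guarantee the resulting record crosses a segment boundary, whatever
// LSN it starts at. The prefix string and payload size are placeholders.
#[allow(dead_code)]
fn emit_segment_sized_message(
    client: &mut impl postgres::GenericClient,
    transactional: bool,
) -> anyhow::Result<PgLsn> {
    // pg_logical_emit_message(transactional, prefix, payload) returns an LSN for the message.
    let row = client.query_one(
        "SELECT pg_logical_emit_message($1, 'prefix', repeat('x', 16 * 1024 * 1024))",
        &[&transactional],
    )?;
    Ok(row.get(0))
}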
@@ -11,15 +11,13 @@ use utils::const_assert;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
fn init_logging() {
|
||||
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!(
|
||||
"crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"
|
||||
)))
|
||||
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
|
||||
format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
|
||||
))
|
||||
.is_test(true)
|
||||
.try_init();
|
||||
}
|
||||
|
||||
/// Test that find_end_of_wal returns the same results as pg_waldump on various
|
||||
/// WALs created by Crafter.
|
||||
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
use crate::*;
|
||||
|
||||
@@ -40,13 +38,13 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
}
|
||||
cfg.initdb().unwrap();
|
||||
let srv = cfg.start_server().unwrap();
|
||||
let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
|
||||
let (intermediate_lsns, expected_end_of_wal_partial) =
|
||||
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
|
||||
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
|
||||
.iter()
|
||||
.map(|&lsn| u64::from(lsn).into())
|
||||
.collect();
|
||||
// Kill postgres. Note that it might have inserted to WAL something after
|
||||
// 'craft' did its job.
|
||||
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
|
||||
srv.kill();
|
||||
|
||||
// Check find_end_of_wal on the initial WAL
|
||||
@@ -58,7 +56,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
.filter(|fname| IsXLogFileName(fname))
|
||||
.max()
|
||||
.unwrap();
|
||||
let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment);
|
||||
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
|
||||
for start_lsn in intermediate_lsns
|
||||
.iter()
|
||||
.chain(std::iter::once(&expected_end_of_wal))
|
||||
@@ -93,7 +91,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
}
|
||||
}
|
||||
|
||||
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
|
||||
fn check_pg_waldump_end_of_wal(
|
||||
cfg: &crate::Conf,
|
||||
last_segment: &str,
|
||||
expected_end_of_wal: Lsn,
|
||||
) {
|
||||
// Get the actual end of WAL by pg_waldump
|
||||
let waldump_output = cfg
|
||||
.pg_waldump("000000010000000000000001", last_segment)
|
||||
@@ -111,8 +113,11 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
|
||||
}
|
||||
};
|
||||
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
||||
info!("waldump erred on {}", waldump_wal_end);
|
||||
waldump_wal_end
|
||||
info!(
|
||||
"waldump erred on {}, expected wal end at {}",
|
||||
waldump_wal_end, expected_end_of_wal
|
||||
);
|
||||
assert_eq!(waldump_wal_end, expected_end_of_wal);
|
||||
}
|
||||
|
||||
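// Hedged sketch (the exact pg_waldump error wording and the regex are assumptions, not taken
// from the diff): check_pg_waldump_end_of_wal above captures the end-of-WAL LSN out of
// pg_waldump's error output via `caps.get(1)`; this shows the kind of pattern that capture
// could come from.
#[cfg(test)]
mod waldump_parse_sketch {
    #[test]
    fn extracts_lsn_from_error_line() {
        let stderr = "pg_waldump: error: invalid record length at 0/2000060";
        let re = regex::Regex::new(r"invalid record length at ([0-9A-F]+/[0-9A-F]+)").unwrap();
        let caps = re.captures(stderr).unwrap();
        assert_eq!(caps.get(1).unwrap().as_str(), "0/2000060");
    }
}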
fn check_end_of_wal(
|
||||
@@ -205,9 +210,9 @@ pub fn test_update_next_xid() {
|
||||
#[test]
|
||||
pub fn test_encode_logical_message() {
|
||||
let expected = [
|
||||
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
|
||||
105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
|
||||
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
|
||||
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
|
||||
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
|
||||
];
|
||||
let actual = encode_logical_message("prefix", "message");
|
||||
assert_eq!(expected, actual[..]);
|
||||
|
||||
@@ -198,7 +198,6 @@ impl LocalFs {
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(&temp_file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
|
||||
@@ -247,7 +247,7 @@ fn scenario_4() {
    //
    // This is in total 5000 + 1000 + 5000 + 1000 = 12000
    //
    // (If we used the method from the previous scenario, and
    // (If we used the the method from the previous scenario, and
    // kept only snapshot at the branch point, we'd need to keep
    // all the WAL between 10000-18000 on the main branch, so
    // the total size would be 5000 + 1000 + 8000 = 14000. The

@@ -13,7 +13,6 @@ testing = ["fail/failpoints"]
|
||||
[dependencies]
|
||||
arc-swap.workspace = true
|
||||
sentry.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
bincode.workspace = true
|
||||
@@ -37,7 +36,6 @@ serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
@@ -48,7 +46,6 @@ strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
walkdir.workspace = true
|
||||
|
||||
pq_proto.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
|
||||
@@ -192,7 +192,6 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
"#;
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_decode() {
|
||||
let expected_claims = Claims {
|
||||
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
|
||||
|
||||
@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
@@ -367,6 +367,7 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||
.middleware(Middleware::post_with_info(
|
||||
add_request_id_header_to_response,
|
||||
))
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.err_handler(route_error_handler)
|
||||
}
|
||||
|
||||
|
||||
@@ -87,8 +87,6 @@ pub mod failpoint_support;
|
||||
|
||||
pub mod yielding_loop;
|
||||
|
||||
pub mod zstd;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
@@ -63,7 +63,6 @@ impl UnwrittenLockFile {
|
||||
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
|
||||
let lock_file = fs::OpenOptions::new()
|
||||
.create(true) // O_CREAT
|
||||
.truncate(true)
|
||||
.write(true)
|
||||
.open(lock_file_path)
|
||||
.context("open lock file")?;
|
||||
|
||||
@@ -245,7 +245,7 @@ impl<'a, T> Guard<'a, T> {
|
||||
///
|
||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
|
||||
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
||||
let mut swapped = Inner::default();
|
||||
let sem = swapped.init_semaphore.clone();
|
||||
// acquire and forget right away, moving the control over to InitPermit
|
||||
@@ -543,7 +543,7 @@ mod tests {
|
||||
target.set(42, permit);
|
||||
|
||||
let (_answer, permit) = {
|
||||
let guard = target
|
||||
let mut guard = target
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -1,60 +1,27 @@
|
||||
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
pub enum VecMapOrdering {
|
||||
Greater,
|
||||
GreaterOrEqual,
|
||||
}
|
||||
|
||||
/// Ordered map datastructure implemented in a Vec.
|
||||
/// Append only - can only add keys that are larger than the
|
||||
/// current max key.
|
||||
/// Ordering can be adjusted using [`VecMapOrdering`]
|
||||
/// during `VecMap` construction.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct VecMap<K, V> {
|
||||
data: Vec<(K, V)>,
|
||||
ordering: VecMapOrdering,
|
||||
}
|
||||
pub struct VecMap<K, V>(Vec<(K, V)>);
|
||||
|
||||
impl<K, V> Default for VecMap<K, V> {
|
||||
fn default() -> Self {
|
||||
VecMap {
|
||||
data: Default::default(),
|
||||
ordering: VecMapOrdering::Greater,
|
||||
}
|
||||
VecMap(Default::default())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum VecMapError {
|
||||
#[error("Key violates ordering constraint")]
|
||||
InvalidKey,
|
||||
#[error("Mismatched ordering constraints")]
|
||||
ExtendOrderingError,
|
||||
}
|
||||
#[derive(Debug)]
|
||||
pub struct InvalidKey;
|
||||
|
||||
impl<K: Ord, V> VecMap<K, V> {
|
||||
pub fn new(ordering: VecMapOrdering) -> Self {
|
||||
Self {
|
||||
data: Vec::new(),
|
||||
ordering,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
|
||||
Self {
|
||||
data: Vec::with_capacity(capacity),
|
||||
ordering,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.data.is_empty()
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn as_slice(&self) -> &[(K, V)] {
|
||||
self.data.as_slice()
|
||||
self.0.as_slice()
|
||||
}
|
||||
|
||||
/// This function may panic if given a range where the lower bound is
|
||||
@@ -62,7 +29,7 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
|
||||
use std::ops::Bound::*;
|
||||
|
||||
let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
|
||||
let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
|
||||
|
||||
let start_idx = match range.start_bound() {
|
||||
Unbounded => 0,
|
||||
@@ -74,7 +41,7 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
};
|
||||
|
||||
let end_idx = match range.end_bound() {
|
||||
Unbounded => self.data.len(),
|
||||
Unbounded => self.0.len(),
|
||||
Included(k) => match binary_search(k) {
|
||||
Ok(idx) => idx + 1,
|
||||
Err(idx) => idx,
|
||||
@@ -82,30 +49,34 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
|
||||
};
|
||||
|
||||
&self.data[start_idx..end_idx]
|
||||
&self.0[start_idx..end_idx]
|
||||
}
|
||||
|
||||
/// Add a key value pair to the map.
|
||||
/// If `key` is not respective of the `self` ordering the
|
||||
/// pair will not be added and `InvalidKey` error will be returned.
|
||||
pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
|
||||
self.validate_key_order(&key)?;
|
||||
/// If `key` is less than or equal to the current maximum key
|
||||
/// the pair will not be added and InvalidKey error will be returned.
|
||||
pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
|
||||
if let Some((last_key, _last_value)) = self.0.last() {
|
||||
if &key <= last_key {
|
||||
return Err(InvalidKey);
|
||||
}
|
||||
}
|
||||
|
||||
let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
|
||||
Ok(delta_size)
|
||||
}
|
||||
|
||||
/// Update the maximum key value pair or add a new key value pair to the map.
|
||||
/// If `key` is not respective of the `self` ordering no updates or additions
|
||||
/// will occur and `InvalidKey` error will be returned.
|
||||
/// If `key` is less than the current maximum key no updates or additions
|
||||
/// will occur and InvalidKey error will be returned.
|
||||
pub fn append_or_update_last(
|
||||
&mut self,
|
||||
key: K,
|
||||
mut value: V,
|
||||
) -> Result<(Option<V>, usize), VecMapError> {
|
||||
if let Some((last_key, last_value)) = self.data.last_mut() {
|
||||
) -> Result<(Option<V>, usize), InvalidKey> {
|
||||
if let Some((last_key, last_value)) = self.0.last_mut() {
|
||||
match key.cmp(last_key) {
|
||||
Ordering::Less => return Err(VecMapError::InvalidKey),
|
||||
Ordering::Less => return Err(InvalidKey),
|
||||
Ordering::Equal => {
|
||||
std::mem::swap(last_value, &mut value);
|
||||
const DELTA_SIZE: usize = 0;
|
||||
@@ -129,67 +100,40 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
V: Clone,
|
||||
{
|
||||
let split_idx = self
|
||||
.data
|
||||
.0
|
||||
.binary_search_by_key(&cutoff, extract_key)
|
||||
.unwrap_or_else(std::convert::identity);
|
||||
|
||||
(
|
||||
VecMap {
|
||||
data: self.data[..split_idx].to_vec(),
|
||||
ordering: self.ordering,
|
||||
},
|
||||
VecMap {
|
||||
data: self.data[split_idx..].to_vec(),
|
||||
ordering: self.ordering,
|
||||
},
|
||||
VecMap(self.0[..split_idx].to_vec()),
|
||||
VecMap(self.0[split_idx..].to_vec()),
|
||||
)
|
||||
}
|
||||
|
||||
/// Move items from `other` to the end of `self`, leaving `other` empty.
|
||||
/// If the `other` ordering is different from `self` ordering
|
||||
/// `ExtendOrderingError` error will be returned.
|
||||
/// If any keys in `other` is not respective of the ordering defined in
|
||||
/// `self`, `InvalidKey` error will be returned and no mutation will occur.
|
||||
pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
|
||||
if self.ordering != other.ordering {
|
||||
return Err(VecMapError::ExtendOrderingError);
|
||||
}
|
||||
/// If any keys in `other` is less than or equal to any key in `self`,
|
||||
/// `InvalidKey` error will be returned and no mutation will occur.
|
||||
pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
|
||||
let self_last_opt = self.0.last().map(extract_key);
|
||||
let other_first_opt = other.0.last().map(extract_key);
|
||||
|
||||
let other_first_opt = other.data.last().map(extract_key);
|
||||
if let Some(other_first) = other_first_opt {
|
||||
self.validate_key_order(other_first)?;
|
||||
}
|
||||
|
||||
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
|
||||
Ok(delta_size)
|
||||
}
|
||||
|
||||
/// Validate the current last key in `self` and key being
|
||||
/// inserted against the order defined in `self`.
|
||||
fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
|
||||
if let Some(last_key) = self.data.last().map(extract_key) {
|
||||
match (&self.ordering, &key.cmp(last_key)) {
|
||||
(VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
|
||||
return Err(VecMapError::InvalidKey);
|
||||
}
|
||||
(VecMapOrdering::Greater, Ordering::Greater) => {}
|
||||
(VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
|
||||
return Err(VecMapError::InvalidKey);
|
||||
}
|
||||
(VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
|
||||
if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
|
||||
if self_last >= other_first {
|
||||
return Err(InvalidKey);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
|
||||
Ok(delta_size)
|
||||
}
|
||||
|
||||
/// Instrument an operation on the underlying [`Vec`].
|
||||
/// Will panic if the operation decreases capacity.
|
||||
/// Returns the increase in memory usage caused by the op.
|
||||
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
|
||||
let old_cap = self.data.capacity();
|
||||
op(&mut self.data);
|
||||
let new_cap = self.data.capacity();
|
||||
let old_cap = self.0.capacity();
|
||||
op(&mut self.0);
|
||||
let new_cap = self.0.capacity();
|
||||
|
||||
match old_cap.cmp(&new_cap) {
|
||||
Ordering::Less => {
|
||||
@@ -201,36 +145,6 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Similar to `from_iter` defined in `FromIter` trait except
|
||||
/// that it accepts an [`VecMapOrdering`]
|
||||
pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
|
||||
let iter = iter.into_iter();
|
||||
let initial_capacity = {
|
||||
match iter.size_hint() {
|
||||
(lower_bound, None) => lower_bound,
|
||||
(_, Some(upper_bound)) => upper_bound,
|
||||
}
|
||||
};
|
||||
|
||||
let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
|
||||
for (key, value) in iter {
|
||||
vec_map
|
||||
.append(key, value)
|
||||
.expect("The passed collection needs to be sorted!");
|
||||
}
|
||||
|
||||
vec_map
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Ord, V> IntoIterator for VecMap<K, V> {
|
||||
type Item = (K, V);
|
||||
type IntoIter = std::vec::IntoIter<(K, V)>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.data.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
||||
@@ -241,7 +155,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
||||
mod tests {
|
||||
use std::{collections::BTreeMap, ops::Bound};
|
||||
|
||||
use super::{VecMap, VecMapOrdering};
|
||||
use super::VecMap;
|
||||
|
||||
#[test]
|
||||
fn unbounded_range() {
|
||||
@@ -396,59 +310,5 @@ mod tests {
|
||||
left.extend(&mut one_map).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||
assert_eq!(one_map.as_slice(), &[(1, ())]);
|
||||
|
||||
let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||
map_greater_or_equal.append(2, ()).unwrap();
|
||||
map_greater_or_equal.append(2, ()).unwrap();
|
||||
|
||||
left.extend(&mut map_greater_or_equal).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||
assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extend_with_ordering() {
|
||||
let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||
left.append(0, ()).unwrap();
|
||||
assert_eq!(left.as_slice(), &[(0, ())]);
|
||||
|
||||
let mut greater_right = VecMap::new(VecMapOrdering::Greater);
|
||||
greater_right.append(0, ()).unwrap();
|
||||
left.extend(&mut greater_right).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ())]);
|
||||
|
||||
let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||
greater_or_equal_right.append(2, ()).unwrap();
|
||||
greater_or_equal_right.append(2, ()).unwrap();
|
||||
left.extend(&mut greater_or_equal_right).unwrap();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vec_map_from_sorted() {
|
||||
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
|
||||
let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
|
||||
assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
|
||||
|
||||
let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
|
||||
let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
|
||||
assert_eq!(
|
||||
vec_map.as_slice(),
|
||||
&[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn vec_map_from_unsorted_greater() {
|
||||
let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
|
||||
let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn vec_map_from_unsorted_greater_or_equal() {
|
||||
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
|
||||
let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
|
||||
}
|
||||
}
|
||||
|
||||
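// Illustrative usage sketch of the ordering semantics discussed in this hunk (it uses the
// `VecMapOrdering`-based API shown on one side of the diff; not part of the change itself):
#[cfg(test)]
mod ordering_usage_sketch {
    use super::{VecMap, VecMapOrdering};

    #[test]
    fn greater_vs_greater_or_equal() {
        // Under Greater, keys must strictly increase: re-appending key 1 is rejected.
        let mut strict = VecMap::new(VecMapOrdering::Greater);
        strict.append(1, "a").unwrap();
        strict.append(1, "b").unwrap_err();

        // Under GreaterOrEqual, equal keys are accepted, but keys still may not decrease.
        let mut relaxed = VecMap::new(VecMapOrdering::GreaterOrEqual);
        relaxed.append(1, "a").unwrap();
        relaxed.append(1, "b").unwrap();
        relaxed.append(0, "c").unwrap_err();
        assert_eq!(relaxed.as_slice(), &[(1, "a"), (1, "b")]);
    }
}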
@@ -1,78 +0,0 @@
|
||||
use std::io::SeekFrom;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use async_compression::{
|
||||
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
|
||||
zstd::CParameter,
|
||||
Level,
|
||||
};
|
||||
use camino::Utf8Path;
|
||||
use nix::NixPath;
|
||||
use tokio::{
|
||||
fs::{File, OpenOptions},
|
||||
io::AsyncBufRead,
|
||||
io::AsyncSeekExt,
|
||||
io::AsyncWriteExt,
|
||||
};
|
||||
use tokio_tar::{Archive, Builder, HeaderMode};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
/// Creates a Zstandard tarball.
|
||||
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
|
||||
let file = OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(&tarball)
|
||||
.await
|
||||
.with_context(|| format!("tempfile creation {tarball}"))?;
|
||||
|
||||
let mut paths = Vec::new();
|
||||
for entry in WalkDir::new(path) {
|
||||
let entry = entry?;
|
||||
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||
// Also allow directories so that we also get empty directories
|
||||
if !(metadata.is_file() || metadata.is_dir()) {
|
||||
continue;
|
||||
}
|
||||
let path = entry.into_path();
|
||||
paths.push(path);
|
||||
}
|
||||
// Do a sort to get a more consistent listing
|
||||
paths.sort_unstable();
|
||||
let zstd = ZstdEncoder::with_quality_and_params(
|
||||
file,
|
||||
Level::Default,
|
||||
&[CParameter::enable_long_distance_matching(true)],
|
||||
);
|
||||
let mut builder = Builder::new(zstd);
|
||||
// Use reproducible header mode
|
||||
builder.mode(HeaderMode::Deterministic);
|
||||
for p in paths {
|
||||
let rel_path = p.strip_prefix(path)?;
|
||||
if rel_path.is_empty() {
|
||||
// The top directory should not be compressed,
|
||||
// the tar crate doesn't like that
|
||||
continue;
|
||||
}
|
||||
builder.append_path_with_name(&p, rel_path).await?;
|
||||
}
|
||||
let mut zstd = builder.into_inner().await?;
|
||||
zstd.shutdown().await?;
|
||||
let mut compressed = zstd.into_inner();
|
||||
let compressed_len = compressed.metadata().await?.len();
|
||||
compressed.seek(SeekFrom::Start(0)).await?;
|
||||
Ok((compressed, compressed_len))
|
||||
}
|
||||
|
||||
/// Creates a Zstandard tarball.
|
||||
pub async fn extract_zst_tarball(
|
||||
path: &Utf8Path,
|
||||
tarball: impl AsyncBufRead + Unpin,
|
||||
) -> Result<()> {
|
||||
let decoder = Box::pin(ZstdDecoder::new(tarball));
|
||||
let mut archive = Archive::new(decoder);
|
||||
archive.unpack(path).await?;
|
||||
Ok(())
|
||||
}
|
||||
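// Illustrative usage sketch of the two helpers above (paths are placeholders, not taken from
// the diff): round-trip a directory through a Zstandard tarball.
#[allow(dead_code)]
async fn roundtrip_sketch() -> anyhow::Result<()> {
    use camino::Utf8Path;
    use tokio::io::BufReader;

    let src = Utf8Path::new("/tmp/pgdata");
    let tarball = Utf8Path::new("/tmp/pgdata.tar.zst");

    // Compress: returns the tarball file handle rewound to the start, plus its length.
    let (file, len) = create_zst_tarball(src, tarball).await?;
    assert!(len > 0);

    // Decompress into a fresh directory; any `AsyncBufRead + Unpin` source works.
    extract_zst_tarball(Utf8Path::new("/tmp/pgdata-restored"), BufReader::new(file)).await?;
    Ok(())
}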
@@ -69,7 +69,7 @@ pub struct Config {
|
||||
/// should be removed once we have a better solution there.
|
||||
sys_buffer_bytes: u64,
|
||||
|
||||
/// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
|
||||
/// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
|
||||
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
|
||||
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
|
||||
/// threshold.
|
||||
|
||||
@@ -59,7 +59,6 @@ signal-hook.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
svg_fmt.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
sysinfo.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
|
||||
@@ -1,156 +1,160 @@
|
||||
//! Quantify a single walredo manager's throughput under N concurrent callers.
//! Simple benchmarking around walredo.
//!
//! The benchmark implementation ([`bench_impl`]) is parametrized by
//! - `redo_work` => [`Request::short_input`] or [`Request::medium_input`]
//! - `n_redos` => number of times the benchmark shall execute the `redo_work`
//! - `nclients` => number of clients (more on this shortly).
//! Right now they hope to just set a baseline. Later we can try to expand into latency and
//! throughput after figuring out the coordinated omission problems below.
//!
//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
//! It spawns `nclients` times [`client`] tokio tasks.
//! Each task executes the `redo_work` `n_redos/nclients` times.
//! There are two sets of inputs, `short` and `medium`. They were collected on postgres v14 by
//! logging what happens when a sequential scan is requested on a small table, then picking out
//! two suitable ones from the logs.
//!
//! We exercise the following combinations:
//! - `redo_work = short / medium`
//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
//!
//! We let `criterion` determine the `n_redos` using `iter_custom`.
//! The idea is that for each `(redo_work, nclients)` combination,
//! criterion will run the `bench_impl` multiple times with different `n_redos`.
//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
//! Criterion will divide that by `n_redos` to compute the "time per iteration".
//! In our case, "time per iteration" means "time per redo_work execution".
//!
//! NB: the way by which `iter_custom` determines the "number of iterations"
//! is called sampling. Apparently the idea here is to detect outliers.
//! We're not sure whether the current choice of sampling method makes sense.
//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
//!
//! # Reference Numbers
|
||||
//!
|
||||
//! 2024-03-20 on i3en.3xlarge
|
||||
//!
|
||||
//! ```text
|
||||
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
|
||||
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
|
||||
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
|
||||
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
|
||||
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
|
||||
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
|
||||
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
|
||||
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
|
||||
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
|
||||
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
|
||||
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
|
||||
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
|
||||
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
|
||||
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
|
||||
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
|
||||
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
|
||||
//! ```
|
||||
//! Reference data (git blame to see commit) on an i3en.3xlarge
|
||||
//! ```text
|
||||
//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
|
||||
//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
|
||||
//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
|
||||
//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
|
||||
//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
|
||||
//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
|
||||
//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
|
||||
//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
|
||||
//! ```
|
||||
|
||||
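// Minimal standalone sketch of the `iter_custom` pattern described above (not the benchmark
// itself; `expensive_work` is a hypothetical stand-in for one `redo_work` execution):
#[allow(dead_code)]
mod iter_custom_sketch {
    use std::time::Instant;

    use criterion::Criterion;

    fn expensive_work() -> u64 {
        (0..10_000u64).sum()
    }

    pub fn bench_sketch(c: &mut Criterion) {
        c.bench_function("sketch", |b| {
            // criterion picks `iters`; we report total wall-clock time and it divides that
            // back by `iters` to get the per-iteration figure.
            b.iter_custom(|iters| {
                let start = Instant::now();
                for _ in 0..iters {
                    std::hint::black_box(expensive_work());
                }
                start.elapsed()
            })
        });
    }
}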
use std::sync::Arc;
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
use pageserver::{
|
||||
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
|
||||
};
|
||||
use tokio::{sync::Barrier, task::JoinSet};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
criterion::criterion_group!(benches, bench);
|
||||
criterion::criterion_main!(benches);
|
||||
fn redo_scenarios(c: &mut Criterion) {
|
||||
// logging should be enabled when adding more inputs, since walredo will only report malformed
|
||||
// input to the stderr.
|
||||
// utils::logging::init(utils::logging::LogFormat::Plain).unwrap();
|
||||
|
||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
{
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
tracing::info!("executing first");
|
||||
rt.block_on(short().execute(&manager)).unwrap();
|
||||
tracing::info!("first executed");
|
||||
}
|
||||
|
||||
let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
||||
|
||||
for thread_count in thread_counts {
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("short", thread_count),
|
||||
&thread_count,
|
||||
|b, thread_count| {
|
||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
|
||||
},
|
||||
);
|
||||
}
|
||||
drop(group);
|
||||
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
||||
|
||||
for thread_count in thread_counts {
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("medium", thread_count),
|
||||
&thread_count,
|
||||
|b, thread_count| {
|
||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
|
||||
},
|
||||
);
|
||||
}
|
||||
drop(group);
|
||||
}
|
||||
|
||||
/// Sets up a multi-threaded tokio runtime with default worker thread count,
/// then spawns `nrequesters` tasks that repeatedly:
/// - get input from `input_factory()`
/// - call `manager.request_redo()` with their input
///
/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
///
/// Using tokio's default worker thread count means the results will differ on machines
/// with different core counts. We don't care about that; the performance will always
/// be different on different hardware. To compare performance of different software versions,
/// use the same hardware.
fn add_multithreaded_walredo_requesters(
|
||||
b: &mut criterion::Bencher,
|
||||
nrequesters: usize,
|
||||
manager: &Arc<PostgresRedoManager>,
|
||||
input_factory: fn() -> Request,
|
||||
) {
|
||||
assert_ne!(nrequesters, 0);
|
||||
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let start = Arc::new(Barrier::new(nclients as usize));
|
||||
let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
|
||||
|
||||
let mut tasks = JoinSet::new();
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
for _ in 0..nclients {
|
||||
rt.block_on(async {
|
||||
tasks.spawn(client(
|
||||
Arc::clone(&manager),
|
||||
Arc::clone(&start),
|
||||
Arc::clone(&redo_work),
|
||||
// divide the amount of work equally among the clients
|
||||
n_redos / nclients,
|
||||
))
|
||||
let mut requesters = JoinSet::new();
|
||||
for _ in 0..nrequesters {
|
||||
let _entered = rt.enter();
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
requesters.spawn(async move {
|
||||
loop {
|
||||
let input = input_factory();
|
||||
barrier.wait().await;
|
||||
let page = input.execute(&manager).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
barrier.wait().await;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
rt.block_on(async move {
|
||||
let mut total_wallclock_time = std::time::Duration::from_millis(0);
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
total_wallclock_time += res.unwrap();
|
||||
}
|
||||
total_wallclock_time
|
||||
})
|
||||
let do_one_iteration = || {
|
||||
rt.block_on(async {
|
||||
barrier.wait().await;
|
||||
// wait for work to complete
|
||||
barrier.wait().await;
|
||||
})
|
||||
};
|
||||
|
||||
b.iter_batched(
|
||||
|| {
|
||||
// warmup
|
||||
do_one_iteration();
|
||||
},
|
||||
|()| {
|
||||
// work loop
|
||||
do_one_iteration();
|
||||
},
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
|
||||
rt.block_on(requesters.shutdown());
|
||||
}
|
||||
|
||||
async fn client(
|
||||
mgr: Arc<PostgresRedoManager>,
|
||||
start: Arc<Barrier>,
|
||||
redo_work: Arc<Request>,
|
||||
n_redos: u64,
|
||||
) -> Duration {
|
||||
start.wait().await;
|
||||
let start = Instant::now();
|
||||
for _ in 0..n_redos {
|
||||
let page = redo_work.execute(&mgr).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
// The real pageserver will rarely if ever do 2 walredos in a row without
|
||||
// yielding to the executor.
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
start.elapsed()
|
||||
}
|
||||
criterion_group!(benches, redo_scenarios);
|
||||
criterion_main!(benches);
|
||||
|
||||
macro_rules! lsn {
|
||||
($input:expr) => {{
|
||||
@@ -162,46 +166,12 @@ macro_rules! lsn {
|
||||
}};
|
||||
}
|
||||
|
||||
/// Simple wrapper around `WalRedoManager::request_redo`.
|
||||
///
|
||||
/// In benchmarks this is cloned around.
|
||||
#[derive(Clone)]
|
||||
struct Request {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
impl Request {
|
||||
async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
|
||||
let Request {
|
||||
key,
|
||||
lsn,
|
||||
base_img,
|
||||
records,
|
||||
pg_version,
|
||||
} = self;
|
||||
|
||||
// TODO: avoid these clones
|
||||
manager
|
||||
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
|
||||
.await
|
||||
}
|
||||
|
||||
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
|
||||
let rec = Bytes::from_static(bytes);
|
||||
NeonWalRecord::Postgres { will_init, rec }
|
||||
}
|
||||
|
||||
/// Short payload, 1132 bytes.
|
||||
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
|
||||
// for null bytes.
|
||||
#[allow(clippy::octal_escapes)]
|
||||
pub fn short_input() -> Request {
|
||||
let pg_record = Self::pg_record;
|
||||
Request {
|
||||
/// Short payload, 1132 bytes.
|
||||
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
|
||||
// for null bytes.
|
||||
#[allow(clippy::octal_escapes)]
|
||||
fn short() -> Request {
|
||||
Request {
|
||||
key: Key {
|
||||
field1: 0,
|
||||
field2: 1663,
|
||||
@@ -224,14 +194,13 @@ impl Request {
|
||||
],
|
||||
pg_version: 14,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Medium sized payload, serializes as 26393 bytes.
|
||||
// see [`short`]
|
||||
#[allow(clippy::octal_escapes)]
|
||||
pub fn medium_input() -> Request {
|
||||
let pg_record = Self::pg_record;
|
||||
Request {
|
||||
/// Medium sized payload, serializes as 26393 bytes.
|
||||
// see [`short`]
|
||||
#[allow(clippy::octal_escapes)]
|
||||
fn medium() -> Request {
|
||||
Request {
|
||||
key: Key {
|
||||
field1: 0,
|
||||
field2: 1663,
|
||||
@@ -473,5 +442,37 @@ impl Request {
|
||||
],
|
||||
pg_version: 14,
|
||||
}
|
||||
}
|
||||
|
||||
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
|
||||
let rec = Bytes::from_static(bytes);
|
||||
NeonWalRecord::Postgres { will_init, rec }
|
||||
}
|
||||
|
||||
/// Simple wrapper around `WalRedoManager::request_redo`.
|
||||
///
|
||||
/// In benchmarks this is cloned around.
|
||||
#[derive(Clone)]
|
||||
struct Request {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
impl Request {
|
||||
async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
|
||||
let Request {
|
||||
key,
|
||||
lsn,
|
||||
base_img,
|
||||
records,
|
||||
pg_version,
|
||||
} = self;
|
||||
|
||||
manager
|
||||
.request_redo(key, lsn, base_img, records, pg_version)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,8 +43,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
fanout: u64,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
|
||||
let exp_base = fanout.max(2);
|
||||
assert!(fanout >= 2);
|
||||
// Start at L0
|
||||
let mut current_level_no = 0;
|
||||
let mut current_level_target_height = target_file_size;
|
||||
@@ -107,7 +106,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
break;
|
||||
}
|
||||
current_level_no += 1;
|
||||
current_level_target_height = current_level_target_height.saturating_mul(exp_base);
|
||||
current_level_target_height = current_level_target_height.saturating_mul(fanout);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
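// Illustrative sketch of the growth rule this hunk changes (not part of the diff): with
// `exp_base = fanout.max(2)`, the per-level target height grows geometrically even when
// fanout is 1. The 128 MiB L0 target below is an assumed example value.
#[cfg(test)]
mod level_height_sketch {
    #[test]
    fn heights_grow_exponentially() {
        let target_file_size: u64 = 128 * 1024 * 1024;
        let fanout: u64 = 1;
        let exp_base = fanout.max(2);

        let mut height = target_file_size;
        let mut heights = Vec::new();
        for _level in 0..4 {
            heights.push(height);
            height = height.saturating_mul(exp_base);
        }
        // L0..L3 target heights double at every level when exp_base == 2.
        assert_eq!(
            heights,
            vec![
                target_file_size,
                target_file_size * 2,
                target_file_size * 4,
                target_file_size * 8,
            ]
        );
    }
}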
|
||||
@@ -600,37 +600,32 @@ fn start_pageserver(
|
||||
None,
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
{
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
tenant_manager,
|
||||
metric_collection_endpoint,
|
||||
&conf.metric_collection_bucket,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
local_disk_storage,
|
||||
cancel,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
local_disk_storage,
|
||||
cancel,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
@@ -95,8 +95,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -158,8 +156,6 @@ pub mod defaults {
|
||||
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
|
||||
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
|
||||
|
||||
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
|
||||
|
||||
[remote_storage]
|
||||
|
||||
"#
|
||||
@@ -238,7 +234,6 @@ pub struct PageServerConf {
|
||||
// How often to send unchanged cached metrics to the metrics endpoint.
|
||||
pub cached_metric_collection_interval: Duration,
|
||||
pub metric_collection_endpoint: Option<Url>,
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
|
||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||
@@ -283,13 +278,6 @@ pub struct PageServerConf {
|
||||
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||
|
||||
pub validate_vectored_get: bool,
|
||||
|
||||
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
|
||||
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
|
||||
/// of ephemeral data.
|
||||
///
|
||||
/// Setting this to zero disables limits on total ephemeral layer size.
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -385,7 +373,6 @@ struct PageServerConfigBuilder {
|
||||
cached_metric_collection_interval: BuilderValue<Duration>,
|
||||
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
||||
synthetic_size_calculation_interval: BuilderValue<Duration>,
|
||||
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
|
||||
|
||||
disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
|
||||
|
||||
@@ -411,8 +398,6 @@ struct PageServerConfigBuilder {
|
||||
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
|
||||
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -470,8 +455,6 @@ impl PageServerConfigBuilder {
|
||||
.expect("cannot parse default synthetic size calculation interval")),
|
||||
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
||||
|
||||
metric_collection_bucket: Set(None),
|
||||
|
||||
disk_usage_based_eviction: Set(None),
|
||||
|
||||
test_remote_failures: Set(0),
|
||||
@@ -499,7 +482,6 @@ impl PageServerConfigBuilder {
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -604,13 +586,6 @@ impl PageServerConfigBuilder {
|
||||
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
|
||||
}
|
||||
|
||||
pub fn metric_collection_bucket(
|
||||
&mut self,
|
||||
metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
) {
|
||||
self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
|
||||
}
|
||||
|
||||
pub fn synthetic_size_calculation_interval(
|
||||
&mut self,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
@@ -679,10 +654,6 @@ impl PageServerConfigBuilder {
|
||||
self.validate_vectored_get = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
|
||||
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -723,7 +694,6 @@ impl PageServerConfigBuilder {
|
||||
metric_collection_interval,
|
||||
cached_metric_collection_interval,
|
||||
metric_collection_endpoint,
|
||||
metric_collection_bucket,
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
@@ -738,7 +708,6 @@ impl PageServerConfigBuilder {
|
||||
get_vectored_impl,
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
}
|
||||
|
||||
{
|
||||
@@ -973,9 +942,6 @@ impl PageServerConf {
|
||||
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
|
||||
builder.metric_collection_endpoint(Some(endpoint));
|
||||
},
|
||||
"metric_collection_bucket" => {
|
||||
builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
|
||||
}
|
||||
"synthetic_size_calculation_interval" =>
|
||||
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
|
||||
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
|
||||
@@ -1029,9 +995,6 @@ impl PageServerConf {
|
||||
"validate_vectored_get" => {
|
||||
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
|
||||
}
|
||||
"ephemeral_bytes_per_memory_kb" => {
|
||||
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1094,7 +1057,6 @@ impl PageServerConf {
|
||||
metric_collection_interval: Duration::from_secs(60),
|
||||
cached_metric_collection_interval: Duration::from_secs(60 * 60),
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
||||
disk_usage_based_eviction: None,
|
||||
test_remote_failures: 0,
|
||||
@@ -1113,7 +1075,6 @@ impl PageServerConf {
|
||||
.expect("Invalid default constant"),
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1328,7 +1289,6 @@ background_task_maximum_delay = '334 s'
|
||||
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
|
||||
)?,
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
|
||||
)?,
|
||||
@@ -1351,7 +1311,6 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1404,7 +1363,6 @@ background_task_maximum_delay = '334 s'
|
||||
metric_collection_interval: Duration::from_secs(222),
|
||||
cached_metric_collection_interval: Duration::from_secs(22200),
|
||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(333),
|
||||
disk_usage_based_eviction: None,
|
||||
test_remote_failures: 0,
|
||||
@@ -1423,7 +1381,6 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -3,13 +3,10 @@
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::{
|
||||
mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
|
||||
};
|
||||
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
|
||||
use camino::Utf8PathBuf;
|
||||
use consumption_metrics::EventType;
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
||||
use reqwest::Url;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
@@ -43,9 +40,7 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
|
||||
/// Main thread that serves metrics collection
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn collect_metrics(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
metric_collection_endpoint: &Url,
|
||||
metric_collection_bucket: &Option<RemoteStorageConfig>,
|
||||
metric_collection_interval: Duration,
|
||||
_cached_metric_collection_interval: Duration,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
@@ -70,19 +65,15 @@ pub async fn collect_metrics(
|
||||
None,
|
||||
"synthetic size calculation",
|
||||
false,
|
||||
{
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
async move {
|
||||
calculate_synthetic_size_worker(
|
||||
tenant_manager,
|
||||
synthetic_size_calculation_interval,
|
||||
&cancel,
|
||||
&worker_ctx,
|
||||
)
|
||||
.instrument(info_span!("synthetic_size_worker"))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
async move {
|
||||
calculate_synthetic_size_worker(
|
||||
synthetic_size_calculation_interval,
|
||||
&cancel,
|
||||
&worker_ctx,
|
||||
)
|
||||
.instrument(info_span!("synthetic_size_worker"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
@@ -103,27 +94,13 @@ pub async fn collect_metrics(
|
||||
.build()
|
||||
.expect("Failed to create http client with timeout");
|
||||
|
||||
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
|
||||
match GenericRemoteStorage::from_config(bucket_config) {
|
||||
Ok(client) => Some(client),
|
||||
Err(e) => {
|
||||
// Non-fatal error: if we were given an invalid config, we will proceed
|
||||
// with sending metrics over the network, but not to S3.
|
||||
tracing::warn!("Invalid configuration for metric_collection_bucket: {e}");
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let node_id = node_id.to_string();
|
||||
|
||||
loop {
|
||||
let started_at = Instant::now();
|
||||
|
||||
// these are point in time, with variable "now"
|
||||
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
||||
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
|
||||
|
||||
let metrics = Arc::new(metrics);
|
||||
|
||||
@@ -141,18 +118,10 @@ pub async fn collect_metrics(
|
||||
tracing::error!("failed to persist metrics to {path:?}: {e:#}");
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(bucket_client) = &bucket_client {
|
||||
let res =
|
||||
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
|
||||
if let Err(e) = res {
|
||||
tracing::error!("failed to upload to S3: {e:#}");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let upload = async {
|
||||
let res = upload::upload_metrics_http(
|
||||
let res = upload::upload_metrics(
|
||||
&client,
|
||||
metric_collection_endpoint,
|
||||
&cancel,
|
||||
@@ -163,7 +132,7 @@ pub async fn collect_metrics(
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
// serialization error which should never happen
|
||||
tracing::error!("failed to upload via HTTP due to {e:#}");
|
||||
tracing::error!("failed to upload due to {e:#}");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -278,7 +247,6 @@ async fn reschedule(
|
||||
|
||||
/// Calculate synthetic size for each active tenant
|
||||
async fn calculate_synthetic_size_worker(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
@@ -291,7 +259,7 @@ async fn calculate_synthetic_size_worker(
|
||||
loop {
|
||||
let started_at = Instant::now();
|
||||
|
||||
let tenants = match tenant_manager.list_tenants() {
|
||||
let tenants = match mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
Err(e) => {
|
||||
warn!("cannot get tenant list: {e:#}");
|
||||
@@ -310,14 +278,10 @@ async fn calculate_synthetic_size_worker(
|
||||
continue;
|
||||
}
|
||||
|
||||
let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else {
|
||||
let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
if !tenant.is_active() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// there is never any reason to exit calculate_synthetic_size_worker following any
|
||||
// return value -- we don't need to care about shutdown because no tenant is found when
|
||||
// pageserver is shut down.
|
||||
@@ -355,7 +319,9 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
|
||||
};
|
||||
|
||||
// this error can be returned if timeline is shutting down, but it does not
|
||||
// mean the synthetic size worker should terminate.
|
||||
// mean the synthetic size worker should terminate. we do not need any checks
|
||||
// in this function because `mgr::get_tenant` will error out after shutdown has
|
||||
// progressed to shutting down tenants.
|
||||
let shutting_down = matches!(
|
||||
e.downcast_ref::<PageReconstructError>(),
|
||||
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use crate::tenant::mgr::TenantManager;
|
||||
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::EventType;
|
||||
@@ -182,7 +181,6 @@ impl MetricsKey {
|
||||
}
|
||||
|
||||
pub(super) async fn collect_all_metrics(
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cached_metrics: &Cache,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<RawMetric> {
|
||||
@@ -190,7 +188,7 @@ pub(super) async fn collect_all_metrics(
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let tenants = match tenant_manager.list_tenants() {
|
||||
let tenants = match crate::tenant::mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
Err(err) => {
|
||||
tracing::error!("failed to list tenants: {:?}", err);
|
||||
@@ -202,8 +200,7 @@ pub(super) async fn collect_all_metrics(
|
||||
if state != TenantState::Active || !id.is_zero() {
|
||||
None
|
||||
} else {
|
||||
tenant_manager
|
||||
.get_attached_tenant_shard(id)
|
||||
crate::tenant::mgr::get_tenant(id, true)
|
||||
.ok()
|
||||
.map(|tenant| (id.tenant_id, tenant))
|
||||
}
|
||||
|
||||
@@ -1,9 +1,4 @@
use std::time::SystemTime;

use chrono::{DateTime, Utc};
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::io::AsyncWriteExt;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;

@@ -18,9 +13,8 @@ struct Ids {
pub(super) timeline_id: Option<TimelineId>,
}

/// Serialize and write metrics to an HTTP endpoint
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics_http(
pub(super) async fn upload_metrics(
client: &reqwest::Client,
metric_collection_endpoint: &reqwest::Url,
cancel: &CancellationToken,
@@ -80,60 +74,6 @@ pub(super) async fn upload_metrics_http(
Ok(())
}

/// Serialize and write metrics to a remote storage object
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics_bucket(
client: &GenericRemoteStorage,
cancel: &CancellationToken,
node_id: &str,
metrics: &[RawMetric],
) -> anyhow::Result<()> {
if metrics.is_empty() {
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
// of an empty object.
return Ok(());
}

// Compose object path
let datetime: DateTime<Utc> = SystemTime::now().into();
let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;

// Set up a gzip writer into a buffer
let mut compressed_bytes: Vec<u8> = Vec::new();
let compressed_writer = std::io::Cursor::new(&mut compressed_bytes);
let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer);

// Serialize and write into compressed buffer
let started_at = std::time::Instant::now();
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
let (_chunk, body) = res?;
gzip_writer.write_all(&body).await?;
}
gzip_writer.flush().await?;
gzip_writer.shutdown().await?;
let compressed_length = compressed_bytes.len();

// Write to remote storage
client
.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))),
compressed_length,
&path,
cancel,
)
.await?;
let elapsed = started_at.elapsed();

tracing::info!(
compressed_length,
elapsed_ms = elapsed.as_millis(),
"write metrics bucket at {path}",
);

Ok(())
}

// The return type is quite ugly, but we gain testability in isolation
fn serialize_in_chunks<'a, F>(
chunk_size: usize,

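The object written by upload_metrics_bucket above is a gzip stream of the serialized chunks, stored under an .ndjson.gz suffix. A minimal reader sketch, not part of this diff (it assumes the flate2, serde_json and anyhow crates, and that each chunk is one JSON document per line, as the suffix suggests):

    use std::io::{BufRead, BufReader};

    use flate2::read::GzDecoder;

    // Hypothetical consumer: decompress a downloaded `<ts>_<node_id>.ndjson.gz`
    // object and decode each non-empty line as a JSON value.
    fn read_metrics_object(compressed: &[u8]) -> anyhow::Result<Vec<serde_json::Value>> {
        let reader = BufReader::new(GzDecoder::new(compressed));
        let mut events = Vec::new();
        for line in reader.lines() {
            let line = line?;
            if line.is_empty() {
                continue;
            }
            events.push(serde_json::from_str(&line)?);
        }
        Ok(events)
    }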
@@ -5,8 +5,7 @@ use pageserver_api::{
|
||||
controller_api::NodeRegisterRequest,
|
||||
shard::TenantShardId,
|
||||
upcall_api::{
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
|
||||
ValidateRequestTenant, ValidateResponse,
|
||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
||||
},
|
||||
};
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
@@ -38,9 +37,7 @@ pub trait ControlPlaneGenerationsApi {
|
||||
fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> impl Future<
|
||||
Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
|
||||
> + Send;
|
||||
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
|
||||
fn validate(
|
||||
&self,
|
||||
tenants: Vec<(TenantShardId, Generation)>,
|
||||
@@ -121,7 +118,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
let re_attach_path = self
|
||||
.base_url
|
||||
.join("re-attach")
|
||||
@@ -184,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
Ok(response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|rart| (rart.id, rart))
|
||||
.map(|t| (t.id, Generation::new(t.gen)))
|
||||
.collect::<HashMap<_, _>>())
|
||||
}
|
||||
|
||||
|
||||
@@ -724,8 +724,8 @@ impl DeletionQueue {
|
||||
mod test {
|
||||
use camino::Utf8Path;
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
|
||||
use std::{io::ErrorKind, time::Duration};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use std::io::ErrorKind;
|
||||
use tracing::info;
|
||||
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
@@ -834,10 +834,9 @@ mod test {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
_conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
async fn validate(
|
||||
&self,
|
||||
tenants: Vec<(TenantShardId, Generation)>,
|
||||
|
||||
@@ -61,6 +61,7 @@ use crate::{
|
||||
metrics::disk_usage_based_eviction::METRICS,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
self,
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
secondary::SecondaryTenant,
|
||||
@@ -813,8 +814,8 @@ async fn collect_eviction_candidates(
|
||||
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
|
||||
|
||||
// get a snapshot of the list of tenants
|
||||
let tenants = tenant_manager
|
||||
.list_tenants()
|
||||
let tenants = tenant::mgr::list_tenants()
|
||||
.await
|
||||
.context("get list of tenants")?;
|
||||
|
||||
// TODO: avoid listing every layer in every tenant: this loop can block the executor,
|
||||
@@ -826,12 +827,8 @@ async fn collect_eviction_candidates(
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
|
||||
Ok(tenant) if tenant.is_active() => tenant,
|
||||
Ok(_) => {
|
||||
debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
|
||||
continue;
|
||||
}
|
||||
let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
// this can happen if tenant has lifecycle transition after we fetched it
|
||||
debug!("failed to get tenant: {e:#}");
|
||||
|
||||
@@ -1038,7 +1038,7 @@ paths:
format: hex
responses:
"201":
description: Timeline was created, or already existed with matching parameters
description: TimelineInfo
content:
application/json:
schema:
@@ -1068,17 +1068,11 @@ paths:
schema:
$ref: "#/components/schemas/Error"
"409":
description: Timeline already exists, with different parameters. Creation cannot proceed.
description: Timeline already exists, creation skipped
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"429":
description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"500":
description: Generic operation error
content:

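The status codes documented above imply a retry policy for callers: 429 means a creation for the same timeline is already in flight and should be retried after a backoff, while 409 reports an existing timeline and should not be blindly retried. A hypothetical client-side sketch, not part of this repository (it assumes reqwest with its json feature, tokio, serde_json and anyhow; the URL shape and body type are placeholders):

    use std::time::Duration;

    // Hypothetical helper illustrating the retry policy implied by the spec above.
    async fn create_timeline_with_retry(
        client: &reqwest::Client,
        url: &str,
        body: &serde_json::Value,
    ) -> anyhow::Result<()> {
        loop {
            let resp = client.post(url).json(body).send().await?;
            match resp.status().as_u16() {
                201 => return Ok(()),
                // Creation already in progress: back off and retry.
                429 => tokio::time::sleep(Duration::from_secs(1)).await,
                // Timeline already exists: surface it rather than retrying.
                409 => anyhow::bail!("timeline already exists"),
                code => anyhow::bail!("unexpected status {code}: {}", resp.text().await?),
            }
        }
    }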
@@ -36,7 +36,6 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::prometheus_metrics_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
@@ -49,8 +48,8 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::mgr::{
|
||||
GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError,
|
||||
TenantSlotUpsertError, TenantStateError,
|
||||
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
|
||||
TenantSlotError, TenantSlotUpsertError, TenantStateError,
|
||||
};
|
||||
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
|
||||
use crate::tenant::remote_timeline_client;
|
||||
@@ -249,11 +248,16 @@ impl From<GetTenantError> for ApiError {
|
||||
fn from(tse: GetTenantError) -> ApiError {
|
||||
match tse {
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
GetTenantError::Broken(reason) => {
|
||||
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
|
||||
}
|
||||
GetTenantError::NotActive(_) => {
|
||||
// Why is this not `ApiError::NotFound`?
|
||||
// Because we must be careful to never return 404 for a tenant if it does
|
||||
// in fact exist locally. If we did, the caller could draw the conclusion
|
||||
// that it can attach the tenant to another PS and we'd be in split-brain.
|
||||
//
|
||||
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
|
||||
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
||||
}
|
||||
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
|
||||
@@ -264,9 +268,6 @@ impl From<GetTenantError> for ApiError {
|
||||
impl From<GetActiveTenantError> for ApiError {
|
||||
fn from(e: GetActiveTenantError) -> ApiError {
|
||||
match e {
|
||||
GetActiveTenantError::Broken(reason) => {
|
||||
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
|
||||
}
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
|
||||
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
|
||||
GetActiveTenantError::NotFound(gte) => gte.into(),
|
||||
@@ -277,6 +278,19 @@ impl From<GetActiveTenantError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SetNewTenantConfigError> for ApiError {
|
||||
fn from(e: SetNewTenantConfigError) -> ApiError {
|
||||
match e {
|
||||
SetNewTenantConfigError::GetTenant(tid) => {
|
||||
ApiError::NotFound(anyhow!("tenant {}", tid).into())
|
||||
}
|
||||
e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::DeleteTimelineError::*;
|
||||
@@ -480,7 +494,7 @@ async fn timeline_create_handler(
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -520,13 +534,10 @@ async fn timeline_create_handler(
|
||||
HttpErrorBody::from_msg("Tenant shutting down".to_string()),
|
||||
)
|
||||
}
|
||||
Err(e @ tenant::CreateTimelineError::Conflict) => {
|
||||
json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string()))
|
||||
}
|
||||
Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response(
|
||||
StatusCode::TOO_MANY_REQUESTS,
|
||||
HttpErrorBody::from_msg(e.to_string()),
|
||||
),
|
||||
Err(
|
||||
e @ tenant::CreateTimelineError::Conflict
|
||||
| e @ tenant::CreateTimelineError::AlreadyCreating,
|
||||
) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())),
|
||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
|
||||
StatusCode::NOT_ACCEPTABLE,
|
||||
HttpErrorBody::from_msg(format!("{err:#}")),
|
||||
@@ -569,7 +580,7 @@ async fn timeline_list_handler(
|
||||
let response_data = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -607,7 +618,6 @@ async fn timeline_preserve_initdb_handler(
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
// Part of the process for disaster recovery from safekeeper-stored WAL:
|
||||
// If we don't recover into a new timeline but want to keep the timeline ID,
|
||||
@@ -615,9 +625,7 @@ async fn timeline_preserve_initdb_handler(
|
||||
// location where timeline recreation can find it.
|
||||
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
@@ -659,7 +667,7 @@ async fn timeline_detail_handler(
|
||||
let timeline_info = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -846,7 +854,7 @@ async fn timeline_delete_handler(
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)
|
||||
.map_err(|e| {
|
||||
match e {
|
||||
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
|
||||
@@ -964,11 +972,10 @@ async fn tenant_list_handler(
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let response_data = state
|
||||
.tenant_manager
|
||||
.list_tenants()
|
||||
let response_data = mgr::list_tenants()
|
||||
.instrument(info_span!("tenant_list"))
|
||||
.await
|
||||
.map_err(|_| {
|
||||
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
|
||||
})?
|
||||
@@ -991,12 +998,9 @@ async fn tenant_status(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant_info = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
@@ -1069,7 +1073,9 @@ async fn tenant_size_handler(
|
||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
||||
let headers = request.headers();
|
||||
let state = get_state(&request);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
@@ -1077,12 +1083,6 @@ async fn tenant_size_handler(
|
||||
)));
|
||||
}
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
// this can be long operation
|
||||
let inputs = tenant
|
||||
.gather_size_inputs(
|
||||
@@ -1151,15 +1151,10 @@ async fn tenant_shard_split_handler(
|
||||
let state = get_state(&request);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let new_shards = state
|
||||
.tenant_manager
|
||||
.shard_split(
|
||||
tenant,
|
||||
tenant_shard_id,
|
||||
ShardCount::new(req.new_shard_count),
|
||||
req.new_stripe_size,
|
||||
&ctx,
|
||||
@@ -1377,11 +1372,8 @@ async fn get_tenant_config_handler(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
let response = HashMap::from([
|
||||
(
|
||||
@@ -1409,31 +1401,15 @@ async fn update_tenant_config_handler(
|
||||
let tenant_id = request_data.tenant_id;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let new_tenant_conf =
|
||||
let tenant_conf =
|
||||
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
let tenant = state
|
||||
state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
// This is a legacy API that only operates on attached tenants: the preferred
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf.clone(),
|
||||
tenant.get_generation(),
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
.set_new_tenant_config(tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1657,12 +1633,10 @@ async fn handle_tenant_break(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
||||
|
||||
let state = get_state(&r);
|
||||
state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?
|
||||
.set_broken("broken from test".to_owned())
|
||||
.await;
|
||||
let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
|
||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||
|
||||
tenant.set_broken("broken from test".to_owned()).await;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1906,7 +1880,7 @@ async fn active_timeline_of_active_tenant(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>, ApiError> {
|
||||
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -2292,7 +2266,6 @@ pub fn make_router(
|
||||
|
||||
Ok(router
|
||||
.data(state)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/v1/status", |r| api_handler(r, status_handler))
|
||||
.put("/v1/failpoints", |r| {
|
||||
testing_api_handler("manage failpoints", r, failpoints_handler)
|
||||
|
||||
@@ -2,20 +2,28 @@
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! a neon Timeline.
|
||||
//!
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use async_compression::tokio::bufread::ZstdDecoder;
|
||||
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::StreamExt;
|
||||
use tokio::io::{AsyncRead, AsyncReadExt};
|
||||
use nix::NixPath;
|
||||
use tokio::fs::{File, OpenOptions};
|
||||
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
|
||||
use tokio_tar::Archive;
|
||||
use tokio_tar::Builder;
|
||||
use tokio_tar::HeaderMode;
|
||||
use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
@@ -625,3 +633,65 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
reader.read_to_end(&mut buf).await?;
Ok(Bytes::from(buf))
}

pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tmp_path)
.await
.with_context(|| format!("tempfile creation {tmp_path}"))?;

let mut paths = Vec::new();
for entry in WalkDir::new(pgdata_path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories so that we also get empty directories
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for path in paths {
let rel_path = path.strip_prefix(pgdata_path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&path, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
}
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
}

pub async fn extract_tar_zst(
pgdata_path: &Utf8Path,
tar_zst: impl AsyncBufRead + Unpin,
) -> Result<()> {
let tar = Box::pin(ZstdDecoder::new(tar_zst));
let mut archive = Archive::new(tar);
archive.unpack(pgdata_path).await?;
Ok(())
}

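Going only by the signatures added above, a round trip through these helpers looks roughly like the following sketch (the directory paths are placeholders, and the function is assumed to live alongside the helpers; the File returned by create_tar_zst has already been seeked back to the start, so it can be wrapped in a BufReader and fed straight to extract_tar_zst):

    use camino::Utf8Path;
    use tokio::io::BufReader;

    // Sketch only: paths are made up for illustration.
    async fn roundtrip_example() -> anyhow::Result<()> {
        let pgdata = Utf8Path::new("/tmp/pgdata");
        let tmp_tar = Utf8Path::new("/tmp/initdb.tar.zst");
        let restore_dir = Utf8Path::new("/tmp/pgdata-restored");

        // Pack the data directory into a deterministic tar.zst archive.
        let (archive_file, archive_len) = create_tar_zst(pgdata, tmp_tar).await?;
        tracing::info!("archive is {archive_len} bytes");

        // The returned handle is positioned at offset 0, so it can be unpacked directly.
        extract_tar_zst(restore_dir, BufReader::new(archive_file)).await?;
        Ok(())
    }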
@@ -435,7 +435,7 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(||
|
||||
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_remote_physical_size",
|
||||
"The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
|
||||
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
|
||||
// Corollary: If any files are missing from the index part, they won't be included here.
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
@@ -699,14 +699,6 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
.expect("Failed to register pageserver_startup_is_loading")
|
||||
});
|
||||
|
||||
pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_timeline_ephemeral_bytes",
|
||||
"Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated."
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
|
||||
/// like how long it took to load.
|
||||
///
|
||||
|
||||
@@ -760,7 +760,6 @@ impl PageServerHandler {
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
|
||||
timeline
|
||||
.import_basebackup_from_tar(
|
||||
tenant.clone(),
|
||||
&mut copyin_reader,
|
||||
base_lsn,
|
||||
self.broker_client.clone(),
|
||||
|
||||
@@ -34,7 +34,6 @@ use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};

const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -1547,13 +1546,12 @@ impl<'a> DatadirModification<'a> {
if !self.pending_updates.is_empty() {
// The put_batch call below expects the inputs to be sorted by Lsn,
// so we do that first.
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
self.pending_updates
.drain()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
VecMapOrdering::GreaterOrEqual,
);
let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self
.pending_updates
.drain()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
.kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
.collect();

writer.put_batch(lsn_ordered_batch, ctx).await?;
}

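The kmerge_by calls above rely on each per-key run already being sorted by LSN, so the k-way merge yields one globally LSN-ordered batch. A small self-contained illustration of the same itertools pattern, using toy data rather than pageserver types:

    use itertools::Itertools;

    // Each inner vector is already sorted by its first field; kmerge_by merges
    // the runs into one globally sorted sequence.
    fn main() {
        let runs = vec![
            vec![(1, "a1"), (4, "a4"), (9, "a9")],
            vec![(2, "b2"), (3, "b3"), (8, "b8")],
        ];
        let merged: Vec<(i32, &str)> = runs
            .into_iter()
            .map(|run| run.into_iter())
            .kmerge_by(|lhs, rhs| lhs.0 < rhs.0)
            .collect();
        assert_eq!(
            merged.iter().map(|(n, _)| *n).collect::<Vec<_>>(),
            vec![1, 2, 3, 4, 8, 9]
        );
    }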
@@ -43,8 +43,6 @@ use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::GateGuard;
|
||||
use utils::timeout::timeout_cancellable;
|
||||
use utils::timeout::TimeoutCancellableError;
|
||||
use utils::zstd::create_zst_tarball;
|
||||
use utils::zstd::extract_zst_tarball;
|
||||
|
||||
use self::config::AttachedLocationConfig;
|
||||
use self::config::AttachmentMode;
|
||||
@@ -202,13 +200,6 @@ pub(super) struct AttachedTenantConf {
|
||||
}
|
||||
|
||||
impl AttachedTenantConf {
|
||||
fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
|
||||
Self {
|
||||
tenant_conf,
|
||||
location,
|
||||
}
|
||||
}
|
||||
|
||||
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Attached(attach_conf) => Ok(Self {
|
||||
@@ -685,20 +676,9 @@ impl Tenant {
|
||||
}
|
||||
|
||||
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
|
||||
enum BrokenVerbosity {
|
||||
Error,
|
||||
Info
|
||||
}
|
||||
let make_broken =
|
||||
|t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
|
||||
match verbosity {
|
||||
BrokenVerbosity::Info => {
|
||||
info!("attach cancelled, setting tenant state to Broken: {err}");
|
||||
},
|
||||
BrokenVerbosity::Error => {
|
||||
error!("attach failed, setting tenant state to Broken: {err:?}");
|
||||
}
|
||||
}
|
||||
|t: &Tenant, err: anyhow::Error| {
|
||||
error!("attach failed, setting tenant state to Broken: {err:?}");
|
||||
t.state.send_modify(|state| {
|
||||
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
|
||||
// if it errors, we will call make_broken when tenant is already in Stopping.
|
||||
@@ -762,7 +742,7 @@ impl Tenant {
|
||||
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
|
||||
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
|
||||
// just shutting down), but ensures progress.
|
||||
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
|
||||
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
|
||||
return Ok(());
|
||||
},
|
||||
)
|
||||
@@ -784,7 +764,7 @@ impl Tenant {
|
||||
match res {
|
||||
Ok(p) => Some(p),
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
@@ -808,7 +788,7 @@ impl Tenant {
|
||||
{
|
||||
Ok(should_resume_deletion) => should_resume_deletion,
|
||||
Err(err) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
@@ -838,7 +818,7 @@ impl Tenant {
|
||||
.await;
|
||||
|
||||
if let Err(e) = deleted {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
@@ -859,7 +839,7 @@ impl Tenant {
|
||||
tenant_clone.activate(broker_client, None, &ctx);
|
||||
}
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1411,7 +1391,7 @@ impl Tenant {
|
||||
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn create_timeline(
|
||||
self: &Arc<Tenant>,
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
mut ancestor_start_lsn: Option<Lsn>,
|
||||
@@ -1559,7 +1539,7 @@ impl Tenant {
|
||||
})?;
|
||||
}
|
||||
|
||||
loaded_timeline.activate(self.clone(), broker_client, None, ctx);
|
||||
loaded_timeline.activate(broker_client, None, ctx);
|
||||
|
||||
Ok(loaded_timeline)
|
||||
}
|
||||
@@ -1731,12 +1711,7 @@ impl Tenant {
|
||||
let mut activated_timelines = 0;
|
||||
|
||||
for timeline in timelines_to_activate {
|
||||
timeline.activate(
|
||||
self.clone(),
|
||||
broker_client.clone(),
|
||||
background_jobs_can_start,
|
||||
ctx,
|
||||
);
|
||||
timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
|
||||
activated_timelines += 1;
|
||||
}
|
||||
|
||||
@@ -2068,12 +2043,7 @@ impl Tenant {
|
||||
TenantState::Active { .. } => {
|
||||
return Ok(());
|
||||
}
|
||||
TenantState::Broken { reason, .. } => {
|
||||
// This is fatal, and reported distinctly from the general case of "will never be active" because
|
||||
// it's logically a 500 to external API users (broken is always a bug).
|
||||
return Err(GetActiveTenantError::Broken(reason));
|
||||
}
|
||||
TenantState::Stopping { .. } => {
|
||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
// There's no chance the tenant can transition back into ::Active
|
||||
return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
|
||||
}
|
||||
@@ -2151,7 +2121,7 @@ impl Tenant {
|
||||
|
||||
// Shut down the timeline's remote client: this means that the indices we write
|
||||
// for child shards will not be invalidated by the parent shard deleting layers.
|
||||
tl_client.shutdown().await;
|
||||
tl_client.shutdown().await?;
|
||||
|
||||
// Download methods can still be used after shutdown, as they don't flow through the remote client's
|
||||
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
|
||||
@@ -3072,13 +3042,8 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
|
||||
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
|
||||
if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
|
||||
warn!(
|
||||
"compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
|
||||
);
|
||||
}
|
||||
let (pgdata_zstd, tar_zst_size) =
|
||||
import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
|
||||
|
||||
pausable_failpoint!("before-initdb-upload");
|
||||
|
||||
@@ -3178,7 +3143,7 @@ impl Tenant {
|
||||
|
||||
let buf_read =
|
||||
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
|
||||
extract_zst_tarball(&pgdata_path, buf_read)
|
||||
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
|
||||
.await
|
||||
.context("extract initdb tar")?;
|
||||
} else {
|
||||
@@ -3653,9 +3618,6 @@ pub(crate) mod harness {
|
||||
heatmap_period: Some(tenant_conf.heatmap_period),
|
||||
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
|
||||
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
|
||||
image_layer_creation_check_threshold: Some(
|
||||
tenant_conf.image_layer_creation_check_threshold,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4790,7 +4752,6 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore]
|
||||
async fn test_random_updates() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_random_updates")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
@@ -5151,7 +5112,6 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore]
|
||||
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_read_at_max_lsn")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -39,7 +39,7 @@ pub mod defaults {
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
|
||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||
pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
|
||||
super::CompactionAlgorithm::Tiered;
|
||||
super::CompactionAlgorithm::Legacy;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
|
||||
@@ -57,9 +57,6 @@ pub mod defaults {
|
||||
// throughputs up to 1GiB/s per timeline.
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
// By default ingest enough WAL for two new L0 layers before checking if new
|
||||
// image layers should be created.
|
||||
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
}
|
||||
@@ -199,17 +196,16 @@ impl LocationConf {
|
||||
/// For use when attaching/re-attaching: update the generation stored in this
|
||||
/// structure. If we were in a secondary state, promote to attached (possession
|
||||
/// of a fresh generation implies this).
|
||||
pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
|
||||
pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
|
||||
match &mut self.mode {
|
||||
LocationMode::Attached(attach_conf) => {
|
||||
attach_conf.generation = generation;
|
||||
attach_conf.attach_mode = mode;
|
||||
}
|
||||
LocationMode::Secondary(_) => {
|
||||
// We are promoted to attached by the control plane's re-attach response
|
||||
self.mode = LocationMode::Attached(AttachedLocationConfig {
|
||||
generation,
|
||||
attach_mode: mode,
|
||||
attach_mode: AttachmentMode::Single,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -365,10 +361,6 @@ pub struct TenantConf {
|
||||
pub lazy_slru_download: bool,
|
||||
|
||||
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
|
||||
|
||||
// How much WAL must be ingested before checking again whether a new image layer is required.
|
||||
// Expresed in multiples of checkpoint distance.
|
||||
pub image_layer_creation_check_threshold: u8,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -461,9 +453,6 @@ pub struct TenantConfOpt {
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -518,9 +507,6 @@ impl TenantConfOpt {
|
||||
.timeline_get_throttle
|
||||
.clone()
|
||||
.unwrap_or(global_conf.timeline_get_throttle),
|
||||
image_layer_creation_check_threshold: self
|
||||
.image_layer_creation_check_threshold
|
||||
.unwrap_or(global_conf.image_layer_creation_check_threshold),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -561,7 +547,6 @@ impl Default for TenantConf {
|
||||
heatmap_period: Duration::ZERO,
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -635,7 +620,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
heatmap_period: value.heatmap_period.map(humantime),
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,7 +111,6 @@ async fn create_local_delete_mark(
|
||||
let _ = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(&marker_path)
|
||||
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
//! page server.
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use futures::stream::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
use pageserver_api::upcall_api::ReAttachResponseTenant;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
@@ -16,7 +16,6 @@ use std::collections::{BTreeMap, HashMap};
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use sysinfo::SystemExt;
|
||||
use tokio::fs;
|
||||
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
|
||||
|
||||
@@ -40,10 +39,10 @@ use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
TenantConfOpt,
|
||||
};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::storage_layer::inmemory_layer;
|
||||
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
@@ -126,46 +125,6 @@ pub(crate) enum ShardSelector {
|
||||
Page(Key),
|
||||
}
|
||||
|
||||
/// A convenience for use with the re_attach ControlPlaneClient function: rather
|
||||
/// than the serializable struct, we build this enum that encapsulates
|
||||
/// the invariant that attached tenants always have generations.
|
||||
///
|
||||
/// This represents the subset of a LocationConfig that we receive during re-attach.
|
||||
pub(crate) enum TenantStartupMode {
|
||||
Attached((AttachmentMode, Generation)),
|
||||
Secondary,
|
||||
}
|
||||
|
||||
impl TenantStartupMode {
|
||||
/// Return the generation & mode that should be used when starting
|
||||
/// this tenant.
|
||||
///
|
||||
/// If this returns None, the re-attach struct is in an invalid state and
|
||||
/// should be ignored in the response.
|
||||
fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option<Self> {
|
||||
match (rart.mode, rart.gen) {
|
||||
(LocationConfigMode::Detached, _) => None,
|
||||
(LocationConfigMode::Secondary, _) => Some(Self::Secondary),
|
||||
(LocationConfigMode::AttachedMulti, Some(g)) => {
|
||||
Some(Self::Attached((AttachmentMode::Multi, Generation::new(g))))
|
||||
}
|
||||
(LocationConfigMode::AttachedSingle, Some(g)) => {
|
||||
Some(Self::Attached((AttachmentMode::Single, Generation::new(g))))
|
||||
}
|
||||
(LocationConfigMode::AttachedStale, Some(g)) => {
|
||||
Some(Self::Attached((AttachmentMode::Stale, Generation::new(g))))
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!(
|
||||
"Received invalid re-attach state for tenant {}: {rart:?}",
|
||||
rart.id
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantsMap {
|
||||
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
|
||||
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
|
||||
@@ -312,7 +271,7 @@ pub struct TenantManager {
|
||||
|
||||
fn emergency_generations(
|
||||
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
|
||||
) -> HashMap<TenantShardId, TenantStartupMode> {
|
||||
) -> HashMap<TenantShardId, Generation> {
|
||||
tenant_confs
|
||||
.iter()
|
||||
.filter_map(|(tid, lc)| {
|
||||
@@ -320,15 +279,12 @@ fn emergency_generations(
|
||||
Ok(lc) => lc,
|
||||
Err(_) => return None,
|
||||
};
|
||||
Some((
|
||||
*tid,
|
||||
match &lc.mode {
|
||||
LocationMode::Attached(alc) => {
|
||||
TenantStartupMode::Attached((alc.attach_mode, alc.generation))
|
||||
}
|
||||
LocationMode::Secondary(_) => TenantStartupMode::Secondary,
|
||||
},
|
||||
))
|
||||
let gen = match &lc.mode {
|
||||
LocationMode::Attached(alc) => Some(alc.generation),
|
||||
LocationMode::Secondary(_) => None,
|
||||
};
|
||||
|
||||
gen.map(|g| (*tid, g))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
@@ -338,7 +294,7 @@ async fn init_load_generations(
|
||||
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
|
||||
resources: &TenantSharedResources,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
|
||||
) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
|
||||
let generations = if conf.control_plane_emergency_mode {
|
||||
error!(
|
||||
"Emergency mode! Tenants will be attached unsafely using their last known generation"
|
||||
@@ -348,12 +304,7 @@ async fn init_load_generations(
|
||||
info!("Calling control plane API to re-attach tenants");
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
match client.re_attach(conf).await {
|
||||
Ok(tenants) => tenants
|
||||
.into_iter()
|
||||
.flat_map(|(id, rart)| {
|
||||
TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm))
|
||||
})
|
||||
.collect(),
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
anyhow::bail!("Shut down while waiting for control plane re-attach response")
|
||||
}
|
||||
@@ -371,17 +322,9 @@ async fn init_load_generations(
|
||||
// Must only do this if remote storage is enabled, otherwise deletion queue
|
||||
// is not running and channel push will fail.
|
||||
if resources.remote_storage.is_some() {
|
||||
let attached_tenants = generations
|
||||
.iter()
|
||||
.flat_map(|(id, start_mode)| {
|
||||
match start_mode {
|
||||
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
|
||||
TenantStartupMode::Secondary => None,
|
||||
}
|
||||
.map(|gen| (*id, *gen))
|
||||
})
|
||||
.collect();
|
||||
resources.deletion_queue_client.recover(attached_tenants)?;
|
||||
resources
|
||||
.deletion_queue_client
|
||||
.recover(generations.clone())?;
|
||||
}
|
||||
|
||||
Ok(Some(generations))
|
||||
@@ -544,23 +487,12 @@ pub async fn init_tenant_mgr(
|
||||
|
||||
let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
|
||||
|
||||
// Initialize dynamic limits that depend on system resources
|
||||
let system_memory =
|
||||
sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory())
|
||||
.total_memory();
|
||||
let max_ephemeral_layer_bytes =
|
||||
conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024);
|
||||
tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory");
|
||||
inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store(
|
||||
max_ephemeral_layer_bytes,
|
||||
std::sync::atomic::Ordering::Relaxed,
|
||||
);
|
||||
|
||||
// Scan local filesystem for attached tenants
|
||||
let tenant_configs = init_load_tenant_configs(conf).await?;
|
||||
|
||||
// Determine which tenants are to be secondary or attached, and in which generation
|
||||
let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
||||
// Determine which tenants are to be attached
|
||||
let tenant_generations =
|
||||
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
||||
|
||||
tracing::info!(
|
||||
"Attaching {} tenants at startup, warming up {} at a time",
|
||||
@@ -590,102 +522,97 @@ pub async fn init_tenant_mgr(
|
||||
}
|
||||
};
|
||||
|
||||
// FIXME: if we were attached, and get demoted to secondary on re-attach, we
|
||||
// don't have a place to get a config.
|
||||
// (https://github.com/neondatabase/neon/issues/5377)
|
||||
const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
|
||||
SecondaryLocationConfig { warm: true };
|
||||
|
||||
// Update the location config according to the re-attach response
|
||||
if let Some(tenant_modes) = &tenant_modes {
|
||||
let generation = if let Some(generations) = &tenant_generations {
|
||||
// We have a generation map: treat it as the authority for whether
|
||||
// this tenant is really attached.
|
||||
match tenant_modes.get(&tenant_shard_id) {
|
||||
None => {
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
|
||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
||||
);
|
||||
}
|
||||
|
||||
// We deleted local content: move on to next tenant, don't try and spawn this one.
|
||||
continue;
|
||||
}
|
||||
Some(TenantStartupMode::Secondary) => {
|
||||
if !matches!(location_conf.mode, LocationMode::Secondary(_)) {
|
||||
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
|
||||
}
|
||||
}
|
||||
Some(TenantStartupMode::Attached((attach_mode, generation))) => {
|
||||
let old_gen_higher = match &location_conf.mode {
|
||||
LocationMode::Attached(AttachedLocationConfig {
|
||||
generation: old_generation,
|
||||
attach_mode: _attach_mode,
|
||||
}) => {
|
||||
if old_generation > generation {
|
||||
Some(old_generation)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
if let Some(old_generation) = old_gen_higher {
|
||||
if let Some(gen) = generations.get(&tenant_shard_id) {
|
||||
if let LocationMode::Attached(attached) = &location_conf.mode {
|
||||
if attached.generation > *gen {
|
||||
tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
|
||||
old_generation
|
||||
"Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
|
||||
attached.generation
|
||||
);
|
||||
|
||||
// We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
|
||||
// local disk content: demote to secondary rather than detaching.
|
||||
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
|
||||
} else {
|
||||
location_conf.attach_in_generation(*attach_mode, *generation);
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf.clone(),
|
||||
&SecondaryLocationConfig { warm: false },
|
||||
)),
|
||||
);
|
||||
}
|
||||
}
|
||||
*gen
|
||||
} else {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Secondary(secondary_config) => {
|
||||
// We do not require the control plane's permission for secondary mode
|
||||
// tenants, because they do no remote writes and hence require no
|
||||
// generation number
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf,
|
||||
secondary_config,
|
||||
)),
|
||||
);
|
||||
}
|
||||
LocationMode::Attached(_) => {
|
||||
// TODO: augment re-attach API to enable the control plane to
|
||||
// instruct us about secondary attachments. That way, instead of throwing
|
||||
// away local state, we can gracefully fall back to secondary here, if the control
|
||||
// plane tells us so.
|
||||
// (https://github.com/neondatabase/neon/issues/5377)
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
|
||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// Legacy mode: no generation information, any tenant present
|
||||
// on local disk may activate
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
|
||||
Generation::none()
|
||||
};
|
||||
|
||||
// Presence of a generation number implies attachment: attach the tenant
|
||||
// if it wasn't already, and apply the generation number.
|
||||
location_conf.attach_in_generation(generation);
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||
|
||||
let shard_identity = location_conf.shard;
|
||||
let slot = match location_conf.mode {
|
||||
LocationMode::Attached(attached_conf) => {
|
||||
match tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => TenantSlot::Attached(tenant),
|
||||
Err(e) => {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
match tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
|
||||
}
|
||||
LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
shard_identity,
|
||||
location_conf.tenant_conf,
|
||||
&secondary_conf,
|
||||
)),
|
||||
};
|
||||
|
||||
tenants.insert(tenant_shard_id, slot);
|
||||
Err(e) => {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("Processed {} local tenants at startup", tenants.len());
|
||||
@@ -888,6 +815,16 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
    // caller will log how long we took
}

#[derive(Debug, thiserror::Error)]
pub(crate) enum SetNewTenantConfigError {
    #[error(transparent)]
    GetTenant(#[from] GetTenantError),
    #[error(transparent)]
    Persist(anyhow::Error),
    #[error(transparent)]
    Other(anyhow::Error),
}

#[derive(thiserror::Error, Debug)]
pub(crate) enum UpsertLocationError {
    #[error("Bad config request: {0}")]
@@ -913,21 +850,32 @@ impl TenantManager {
|
||||
self.conf
|
||||
}
|
||||
|
||||
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
|
||||
/// undergoing a state change (i.e. slot is InProgress).
|
||||
///
|
||||
/// The returned Tenant is not guaranteed to be active: check its status after obtaining it, or
|
||||
/// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
|
||||
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
pub(crate) fn get_attached_tenant_shard(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
active_only: bool,
|
||||
) -> Result<Arc<Tenant>, GetTenantError> {
|
||||
let locked = self.tenants.read().unwrap();
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
|
||||
|
||||
match peek_slot {
|
||||
Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
|
||||
Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
|
||||
TenantState::Broken {
|
||||
reason,
|
||||
backtrace: _,
|
||||
} if active_only => Err(GetTenantError::Broken(reason)),
|
||||
TenantState::Active => Ok(Arc::clone(tenant)),
|
||||
_ => {
|
||||
if active_only {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
},
|
||||
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
|
||||
None | Some(TenantSlot::Secondary(_)) => {
|
||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||
@@ -1420,8 +1368,7 @@ impl TenantManager {
|
||||
.wait_to_become_active(activation_timeout)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
GetActiveTenantError::WillNotBecomeActive(_)
|
||||
| GetActiveTenantError::Broken(_) => {
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => {
|
||||
DeleteTenantError::InvalidState(tenant.current_state())
|
||||
}
|
||||
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
|
||||
@@ -1448,30 +1395,29 @@ impl TenantManager {
|
||||
result
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
pub(crate) async fn shard_split(
|
||||
&self,
|
||||
tenant: Arc<Tenant>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant_shard_id = *tenant.get_tenant_shard_id();
|
||||
let r = self
|
||||
.do_shard_split(tenant, new_shard_count, new_stripe_size, ctx)
|
||||
.do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
|
||||
.await;
|
||||
if r.is_err() {
|
||||
// Shard splitting might have left the original shard in a partially shut down state (it
|
||||
// stops the shard's remote timeline client). Reset it to ensure we leave things in
|
||||
// a working state.
|
||||
if self.get(tenant_shard_id).is_some() {
|
||||
tracing::warn!("Resetting after shard split failure");
|
||||
tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
|
||||
if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
|
||||
// Log this error because our return value will still be the original error, not this one. This is
|
||||
// a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
|
||||
// (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or
|
||||
// setting it broken probably won't help either.
|
||||
tracing::error!("Failed to reset: {e}");
|
||||
tracing::error!("Failed to reset {tenant_shard_id}: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1481,12 +1427,12 @@ impl TenantManager {
|
||||
|
||||
pub(crate) async fn do_shard_split(
|
||||
&self,
|
||||
tenant: Arc<Tenant>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant_shard_id = *tenant.get_tenant_shard_id();
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
// Validate the incoming request
|
||||
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
|
||||
@@ -1532,6 +1478,7 @@ impl TenantManager {
|
||||
// If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
|
||||
// have been left in a partially-shut-down state.
|
||||
tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
|
||||
self.reset_tenant(tenant_shard_id, false, ctx).await?;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
@@ -1715,9 +1662,9 @@ impl TenantManager {
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
.resident_layers()
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
for layer in timeline_layers {
|
||||
let relative_path = layer
|
||||
.local_path()
|
||||
@@ -1929,23 +1876,38 @@ impl TenantManager {
|
||||
removal_result
|
||||
}
|
||||
|
||||
pub(crate) fn list_tenants(
|
||||
pub(crate) async fn set_new_tenant_config(
|
||||
&self,
|
||||
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
|
||||
let tenants = TENANTS.read().unwrap();
|
||||
let m = match &*tenants {
|
||||
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
|
||||
};
|
||||
Ok(m.iter()
|
||||
.filter_map(|(id, tenant)| match tenant {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
Some((*id, tenant.current_state(), tenant.generation()))
|
||||
}
|
||||
TenantSlot::Secondary(_) => None,
|
||||
TenantSlot::InProgress(_) => None,
|
||||
})
|
||||
.collect())
|
||||
new_tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), SetNewTenantConfigError> {
|
||||
// Legacy API: does not support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
|
||||
// Note that we use ShardParameters::default below.
|
||||
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
|
||||
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
|
||||
)));
|
||||
}
|
||||
|
||||
// This is a legacy API that only operates on attached tenants: the preferred
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf.clone(),
|
||||
tenant.generation,
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1958,12 +1920,51 @@ pub(crate) enum GetTenantError {

    #[error("Tenant {0} is not active")]
    NotActive(TenantShardId),
    /// Broken is logically a subset of NotActive, but a distinct error is useful as
    /// NotActive is usually a retryable state for API purposes, whereas Broken
    /// is a stuck error state
    #[error("Tenant is broken: {0}")]
    Broken(String),

    // Initializing or shutting down: cannot authoritatively say whether we have this tenant
    #[error("Tenant map is not available: {0}")]
    MapState(#[from] TenantMapError),
}

/// Gets the tenant from the in-memory data, erroring if it's absent or does not match the query.
/// `active_only = true` restricts the query to tenants that are ready for operations, erroring on other kinds of tenants.
///
/// This method is cancel-safe.
pub(crate) fn get_tenant(
    tenant_shard_id: TenantShardId,
    active_only: bool,
) -> Result<Arc<Tenant>, GetTenantError> {
    let locked = TENANTS.read().unwrap();

    let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;

    match peek_slot {
        Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
            TenantState::Broken {
                reason,
                backtrace: _,
            } if active_only => Err(GetTenantError::Broken(reason)),
            TenantState::Active => Ok(Arc::clone(tenant)),
            _ => {
                if active_only {
                    Err(GetTenantError::NotActive(tenant_shard_id))
                } else {
                    Ok(Arc::clone(tenant))
                }
            }
        },
        Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
        None | Some(TenantSlot::Secondary(_)) => {
            Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
        }
    }
}
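// Illustrative sketch (not part of the diff): one way a caller might classify
// `get_tenant` failures, treating transitional states as retryable. The helper
// name `is_retryable_lookup_error` is hypothetical; only variants visible in
// this hunk are matched explicitly.
fn is_retryable_lookup_error(e: &GetTenantError) -> bool {
    // NotActive covers both non-Active attached tenants (with `active_only = true`)
    // and InProgress slots; MapState covers an initializing or shutting-down tenant map.
    matches!(
        e,
        GetTenantError::NotActive(_) | GetTenantError::MapState(_)
    )
}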
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetActiveTenantError {
    /// We may time out either while TenantSlot is InProgress, or while the Tenant
@@ -1987,12 +1988,6 @@ pub(crate) enum GetActiveTenantError {
    /// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken)
    #[error("will not become active. Current state: {0}")]
    WillNotBecomeActive(TenantState),

    /// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as
    /// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should
    /// never happen.
    #[error("Tenant is broken: {0}")]
    Broken(String),
}
|
||||
|
||||
/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
|
||||
@@ -2148,7 +2143,7 @@ pub(crate) async fn load_tenant(
|
||||
|
||||
let mut location_conf =
|
||||
Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
|
||||
location_conf.attach_in_generation(AttachmentMode::Single, generation);
|
||||
location_conf.attach_in_generation(generation);
|
||||
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||
|
||||
@@ -2212,6 +2207,27 @@ pub(crate) enum TenantMapListError {
|
||||
Initializing,
|
||||
}
|
||||
|
||||
///
|
||||
/// Get list of tenants, for the mgmt API
|
||||
///
|
||||
pub(crate) async fn list_tenants(
|
||||
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
|
||||
let tenants = TENANTS.read().unwrap();
|
||||
let m = match &*tenants {
|
||||
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
|
||||
};
|
||||
Ok(m.iter()
|
||||
.filter_map(|(id, tenant)| match tenant {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
Some((*id, tenant.current_state(), tenant.generation()))
|
||||
}
|
||||
TenantSlot::Secondary(_) => None,
|
||||
TenantSlot::InProgress(_) => None,
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum TenantMapInsertError {
|
||||
#[error(transparent)]
|
||||
|
||||
@@ -217,7 +217,7 @@ use crate::task_mgr::shutdown_token;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::download::download_retry;
|
||||
use crate::tenant::storage_layer::AsLayerDesc;
|
||||
use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
|
||||
use crate::tenant::upload_queue::Delete;
|
||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
@@ -266,6 +266,15 @@ pub enum MaybeDeletedIndexPart {
|
||||
Deleted(IndexPart),
|
||||
}
|
||||
|
||||
/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
#[derive(Debug, thiserror::Error)]
pub enum StopError {
    /// Returned if the upload queue was never initialized.
    /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
    #[error("queue is not initialized")]
    QueueUninitialized,
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum PersistIndexPartWithDeletedFlagError {
|
||||
#[error("another task is already setting the deleted_flag, started at {0:?}")]
|
||||
@@ -390,10 +399,15 @@ impl RemoteTimelineClient {
|
||||
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
|
||||
))?;
|
||||
|
||||
{
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
}
|
||||
// also locks upload queue, without dropping the guard above it will be a deadlock
|
||||
self.stop().expect("initialized line above");
|
||||
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
self.stop_impl(&mut upload_queue);
|
||||
|
||||
upload_queue
|
||||
.stopped_mut()
|
||||
@@ -407,8 +421,7 @@ impl RemoteTimelineClient {
|
||||
match &mut *self.upload_queue.lock().unwrap() {
|
||||
UploadQueue::Uninitialized => None,
|
||||
UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
|
||||
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
|
||||
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q
|
||||
UploadQueue::Stopped(q) => q
|
||||
.upload_queue_for_deletion
|
||||
.get_last_remote_consistent_lsn_projected(),
|
||||
}
|
||||
@@ -418,8 +431,7 @@ impl RemoteTimelineClient {
|
||||
match &mut *self.upload_queue.lock().unwrap() {
|
||||
UploadQueue::Uninitialized => None,
|
||||
UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
|
||||
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
|
||||
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some(
|
||||
UploadQueue::Stopped(q) => Some(
|
||||
q.upload_queue_for_deletion
|
||||
.get_last_remote_consistent_lsn_visible(),
|
||||
),
|
||||
@@ -886,7 +898,7 @@ impl RemoteTimelineClient {
|
||||
/// Wait for all previously scheduled operations to complete, and then stop.
|
||||
///
|
||||
/// Not cancellation safe
|
||||
pub(crate) async fn shutdown(self: &Arc<Self>) {
|
||||
pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
|
||||
// On cancellation the queue is left in an awkward state of refusing new operations but
|
||||
// proper stop is yet to be called. On cancel the original or some later task must call
|
||||
// `stop` or `shutdown`.
|
||||
@@ -897,12 +909,8 @@ impl RemoteTimelineClient {
|
||||
let fut = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = match &mut *guard {
|
||||
UploadQueue::Stopped(_) => return,
|
||||
UploadQueue::Uninitialized => {
|
||||
// transition into Stopped state
|
||||
self.stop_impl(&mut guard);
|
||||
return;
|
||||
}
|
||||
UploadQueue::Stopped(_) => return Ok(()),
|
||||
UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
|
||||
UploadQueue::Initialized(ref mut init) => init,
|
||||
};
|
||||
|
||||
@@ -934,7 +942,7 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
}
|
||||
|
||||
self.stop();
|
||||
self.stop()
|
||||
}
|
||||
|
||||
/// Set the deleted_at field in the remote index file.
|
||||
@@ -1316,7 +1324,12 @@ impl RemoteTimelineClient {
|
||||
// upload finishes or times out soon enough.
|
||||
if cancel.is_cancelled() {
|
||||
info!("upload task cancelled by shutdown request");
|
||||
self.stop();
|
||||
match self.stop() {
|
||||
Ok(()) => {}
|
||||
Err(StopError::QueueUninitialized) => {
|
||||
unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1571,23 +1584,17 @@ impl RemoteTimelineClient {
    /// In-progress operations will still be running after this function returns.
    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
    /// to wait for them to complete, after calling this function.
    pub(crate) fn stop(&self) {
    pub(crate) fn stop(&self) -> Result<(), StopError> {
        // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
        // into stopped state, thereby dropping all of the queued *ops* which haven't become *tasks* yet.
        // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
        let mut guard = self.upload_queue.lock().unwrap();
        self.stop_impl(&mut guard);
    }
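    // Illustrative sketch (not part of the diff): with `stop` returning
    // `Result<(), StopError>` on one side of this change, a caller that has already
    // initialized the upload queue can surface the only possible error explicitly,
    // mirroring the `self.stop().expect("initialized line above")` pattern used
    // earlier in this file. The helper name `stop_initialized_queue` is hypothetical.
    fn stop_initialized_queue(client: &RemoteTimelineClient) {
        match client.stop() {
            Ok(()) => {}
            Err(StopError::QueueUninitialized) => {
                unreachable!("upload queue was initialized before calling stop")
            }
        }
    }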
|
||||
|
||||
fn stop_impl(&self, guard: &mut std::sync::MutexGuard<UploadQueue>) {
|
||||
match &mut **guard {
|
||||
UploadQueue::Uninitialized => {
|
||||
info!("UploadQueue is in state Uninitialized, nothing to do");
|
||||
**guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized);
|
||||
}
|
||||
match &mut *guard {
|
||||
UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
|
||||
UploadQueue::Stopped(_) => {
|
||||
// nothing to do
|
||||
info!("another concurrent task already shut down the queue");
|
||||
Ok(())
|
||||
}
|
||||
UploadQueue::Initialized(initialized) => {
|
||||
info!("shutting down upload queue");
|
||||
@@ -1620,13 +1627,11 @@ impl RemoteTimelineClient {
|
||||
};
|
||||
|
||||
let upload_queue = std::mem::replace(
|
||||
&mut **guard,
|
||||
UploadQueue::Stopped(UploadQueueStopped::Deletable(
|
||||
UploadQueueStoppedDeletable {
|
||||
upload_queue_for_deletion,
|
||||
deleted_at: SetDeletedFlagProgress::NotRunning,
|
||||
},
|
||||
)),
|
||||
&mut *guard,
|
||||
UploadQueue::Stopped(UploadQueueStopped {
|
||||
upload_queue_for_deletion,
|
||||
deleted_at: SetDeletedFlagProgress::NotRunning,
|
||||
}),
|
||||
);
|
||||
if let UploadQueue::Initialized(qi) = upload_queue {
|
||||
qi
|
||||
@@ -1655,6 +1660,10 @@ impl RemoteTimelineClient {
|
||||
// which is exactly what we want to happen.
|
||||
drop(op);
|
||||
}
|
||||
|
||||
// We're done.
|
||||
drop(guard);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,6 @@ use crate::{
|
||||
tenant::{
|
||||
config::SecondaryLocationConfig,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
remote_timeline_client::{
|
||||
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
@@ -535,11 +534,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
.await
|
||||
.maybe_fatal_err(&context_msg)?;
|
||||
|
||||
tracing::debug!(
|
||||
"Wrote local heatmap to {}, with {} timelines",
|
||||
heatmap_path,
|
||||
heatmap.timelines.len()
|
||||
);
|
||||
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
|
||||
|
||||
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
|
||||
// principle that deletions should be done before writes wherever possible, and so that we can use this
|
||||
@@ -552,10 +547,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!(
|
||||
"Cancelled before downloading timeline {}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -773,13 +764,10 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
};
|
||||
|
||||
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!("Cancelled -- dropping out of layer loop");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -962,10 +950,7 @@ async fn init_timeline_state(
|
||||
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
||||
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
|
||||
continue;
|
||||
} else if crate::is_temporary(&file_path)
|
||||
|| is_temp_download_file(&file_path)
|
||||
|| is_ephemeral_file(file_name)
|
||||
{
|
||||
} else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
|
||||
// Temporary files are frequently left behind from restarting during downloads
|
||||
tracing::info!("Cleaning up temporary file {file_path}");
|
||||
if let Err(e) = tokio::fs::remove_file(&file_path)
|
||||
|
||||
@@ -9,7 +9,6 @@ use crate::{
|
||||
metrics::SECONDARY_MODE,
|
||||
tenant::{
|
||||
config::AttachmentMode,
|
||||
mgr::GetTenantError,
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::remote_heatmap_path,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
@@ -293,11 +292,8 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
"Starting heatmap write on command");
|
||||
let tenant = self
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(*tenant_shard_id)
|
||||
.get_attached_tenant_shard(*tenant_shard_id, true)
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
if !tenant.is_active() {
|
||||
return Err(GetTenantError::NotActive(*tenant_shard_id).into());
|
||||
}
|
||||
|
||||
Ok(UploadPending {
|
||||
// Ignore our state for last digest: this forces an upload even if nothing has changed
|
||||
|
||||
@@ -300,7 +300,6 @@ where
|
||||
|
||||
let tenant_shard_id = job.get_tenant_shard_id();
|
||||
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
|
||||
tracing::info!("Command already running, waiting for it");
|
||||
barrier
|
||||
} else {
|
||||
let running = self.spawn_now(job);
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
pub mod delta_layer;
|
||||
mod filename;
|
||||
pub mod image_layer;
|
||||
pub(crate) mod inmemory_layer;
|
||||
mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
|
||||
|
||||
@@ -47,7 +47,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::BytesMut;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -947,34 +946,6 @@ impl DeltaLayerInner {
|
||||
Ok(planner.finish())
|
||||
}
|
||||
|
||||
    fn get_min_read_buffer_size(
        planned_reads: &[VectoredRead],
        read_size_soft_max: usize,
    ) -> usize {
        let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
            return read_size_soft_max;
        };

        let largest_read_size = largest_read.size();
        if largest_read_size > read_size_soft_max {
            // If the read is oversized, it should only contain one key.
            let offenders = largest_read
                .blobs_at
                .as_slice()
                .iter()
                .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
                .join(", ");
            tracing::warn!(
                "Oversized vectored read ({} > {}) for keys {}",
                largest_read_size,
                read_size_soft_max,
                offenders
            );
        }

        largest_read_size
    }
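    // Illustrative sketch (not part of the diff): the sizing rule above, restated
    // over plain byte counts so it can be eyeballed in isolation. `VectoredRead`
    // is elided; the helper name `min_read_buffer_size` is hypothetical.
    fn min_read_buffer_size(read_sizes: &[usize], read_size_soft_max: usize) -> usize {
        // No planned reads: fall back to the soft maximum. Otherwise the largest
        // planned read decides the capacity, even when it exceeds the soft max
        // (such an oversized read holds a single large value and is only warned about).
        read_sizes.iter().copied().max().unwrap_or(read_size_soft_max)
    }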
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
@@ -988,8 +959,7 @@ impl DeltaLayerInner {
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into();
|
||||
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
|
||||
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||
let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
|
||||
|
||||
// Note that reads are processed in reverse order (from highest key+lsn).
|
||||
// This is the order that `ReconstructState` requires such that it can
|
||||
@@ -1016,7 +986,7 @@ impl DeltaLayerInner {
|
||||
|
||||
// We have "lost" the buffer since the lower level IO api
|
||||
// doesn't return the buffer on error. Allocate a new one.
|
||||
buf = Some(BytesMut::with_capacity(buf_size));
|
||||
buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
|
||||
|
||||
continue;
|
||||
}
|
||||
@@ -1240,16 +1210,9 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
|
||||
mod test {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use itertools::MinMaxResult;
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::{
|
||||
context::DownloadBehavior,
|
||||
task_mgr::TaskKind,
|
||||
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
|
||||
DEFAULT_PG_VERSION,
|
||||
context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
|
||||
};
|
||||
|
||||
/// Construct an index for a fictional delta layer and then
|
||||
@@ -1369,229 +1332,4 @@ mod test {
|
||||
|
||||
assert_eq!(planned_blobs, expected_blobs);
|
||||
}
|
||||
|
||||
    mod constants {
        use utils::lsn::Lsn;

        /// Offset used by all lsns in this test
        pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
        /// Number of unique keys included in the test data
        pub(super) const KEY_COUNT: u8 = 60;
        /// Max number of different lsns for each key
        pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
        /// Possible value sizes for each key along with a probability weight
        pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
        /// Probability that there will be a gap between the current key and the next one (33.3%)
        pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
        /// The minimum size of a key range in all the generated reads
        pub(super) const MIN_RANGE_SIZE: i128 = 10;
        /// The number of ranges included in each vectored read
        pub(super) const RANGES_COUNT: u8 = 2;
        /// The number of vectored reads performed
        pub(super) const READS_COUNT: u8 = 100;
        /// Soft max size of a vectored read. Will be violated if we have to read keys
        /// with values larger than the limit
        pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
    }
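    // Illustrative sketch (not part of the diff): how the VALUE_SIZES weights above
    // translate into draw probabilities, using the same `choose_weighted` call as the
    // test body (weights 2:2:1 give roughly 40% / 40% / 20%). The helper name
    // `sample_value_size` is hypothetical; `StdRng` and `SliceRandom` come from the
    // `rand::prelude` import already used by this module.
    fn sample_value_size(rng: &mut StdRng) -> usize {
        constants::VALUE_SIZES
            .choose_weighted(rng, |item| item.1)
            .unwrap()
            .0
    }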
|
||||
|
||||
struct Entry {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
value: Vec<u8>,
|
||||
}
|
||||
|
||||
fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
|
||||
let mut current_key = Key::MIN;
|
||||
|
||||
let mut entries = Vec::new();
|
||||
for _ in 0..constants::KEY_COUNT {
|
||||
let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
|
||||
let mut lsns_iter =
|
||||
std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
|
||||
Some(Lsn(lsn.0 + 0x08))
|
||||
});
|
||||
let mut lsns = Vec::new();
|
||||
while lsns.len() < count as usize {
|
||||
let take = rng.gen_bool(0.5);
|
||||
let lsn = lsns_iter.next().unwrap();
|
||||
if take {
|
||||
lsns.push(lsn);
|
||||
}
|
||||
}
|
||||
|
||||
for lsn in lsns {
|
||||
let size = constants::VALUE_SIZES
|
||||
.choose_weighted(rng, |item| item.1)
|
||||
.unwrap()
|
||||
.0;
|
||||
let mut buf = vec![0; size];
|
||||
rng.fill_bytes(&mut buf);
|
||||
|
||||
entries.push(Entry {
|
||||
key: current_key,
|
||||
lsn,
|
||||
value: buf,
|
||||
})
|
||||
}
|
||||
|
||||
let gap = constants::KEY_GAP_CHANGES
|
||||
.choose_weighted(rng, |item| item.1)
|
||||
.unwrap()
|
||||
.0;
|
||||
if gap {
|
||||
current_key = current_key.add(2);
|
||||
} else {
|
||||
current_key = current_key.add(1);
|
||||
}
|
||||
}
|
||||
|
||||
entries
|
||||
}
|
||||
|
||||
struct EntriesMeta {
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
index: BTreeMap<(Key, Lsn), Vec<u8>>,
|
||||
}
|
||||
|
||||
fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
|
||||
let key_range = match entries.iter().minmax_by_key(|e| e.key) {
|
||||
MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
|
||||
_ => panic!("More than one entry is always expected"),
|
||||
};
|
||||
|
||||
let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
|
||||
MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
|
||||
_ => panic!("More than one entry is always expected"),
|
||||
};
|
||||
|
||||
let mut index = BTreeMap::new();
|
||||
for entry in entries.iter() {
|
||||
index.insert((entry.key, entry.lsn), entry.value.clone());
|
||||
}
|
||||
|
||||
EntriesMeta {
|
||||
key_range,
|
||||
lsn_range,
|
||||
index,
|
||||
}
|
||||
}
|
||||
|
||||
fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
|
||||
let start = key_range.start.to_i128();
|
||||
let end = key_range.end.to_i128();
|
||||
|
||||
let mut keyspace = KeySpace::default();
|
||||
|
||||
for _ in 0..constants::RANGES_COUNT {
|
||||
let mut range: Option<Range<Key>> = Option::default();
|
||||
while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
|
||||
let range_start = rng.gen_range(start..end);
|
||||
let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
|
||||
if range_end_offset >= end {
|
||||
range = Some(Key::from_i128(range_start)..Key::from_i128(end));
|
||||
} else {
|
||||
let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
|
||||
range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
|
||||
}
|
||||
}
|
||||
keyspace.ranges.push(range.unwrap());
|
||||
}
|
||||
|
||||
keyspace
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
tracing::info!("Generating test data ...");
|
||||
|
||||
let rng = &mut StdRng::seed_from_u64(0);
|
||||
let entries = generate_entries(rng);
|
||||
let entries_meta = get_entries_meta(&entries);
|
||||
|
||||
tracing::info!("Done generating {} entries", entries.len());
|
||||
|
||||
tracing::info!("Writing test data to delta layer ...");
|
||||
let mut writer = DeltaLayerWriter::new(
|
||||
harness.conf,
|
||||
timeline_id,
|
||||
harness.tenant_shard_id,
|
||||
entries_meta.key_range.start,
|
||||
entries_meta.lsn_range.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
for entry in entries {
|
||||
let (_, res) = writer
|
||||
.put_value_bytes(entry.key, entry.lsn, entry.value, false)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
|
||||
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
|
||||
|
||||
let inner = resident.get_inner_delta(&ctx).await?;
|
||||
|
||||
let file_size = inner.file.metadata().await?.len();
|
||||
tracing::info!(
|
||||
"Done writing test data to delta layer. Resulting file size is: {}",
|
||||
file_size
|
||||
);
|
||||
|
||||
for i in 0..constants::READS_COUNT {
|
||||
tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
|
||||
|
||||
let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
|
||||
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
block_reader,
|
||||
);
|
||||
|
||||
let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
|
||||
let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
|
||||
let vectored_reads = DeltaLayerInner::plan_reads(
|
||||
keyspace.clone(),
|
||||
entries_meta.lsn_range.clone(),
|
||||
data_end_offset,
|
||||
index_reader,
|
||||
planner,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
|
||||
let buf_size = DeltaLayerInner::get_min_read_buffer_size(
|
||||
&vectored_reads,
|
||||
constants::MAX_VECTORED_READ_BYTES,
|
||||
);
|
||||
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||
|
||||
for read in vectored_reads {
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&read, buf.take().expect("Should have a buffer"))
|
||||
.await?;
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let value = &blobs_buf.buf[meta.start..meta.end];
|
||||
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
|
||||
}
|
||||
|
||||
buf = Some(blobs_buf.buf);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,7 +44,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -541,25 +540,7 @@ impl ImageLayerInner {
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
for read in reads.into_iter() {
|
||||
let buf_size = read.size();
|
||||
|
||||
if buf_size > max_vectored_read_bytes {
|
||||
// If the read is oversized, it should only contain one key.
|
||||
let offenders = read
|
||||
.blobs_at
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
|
||||
.join(", ");
|
||||
tracing::warn!(
|
||||
"Oversized vectored read ({} > {}) for keys {}",
|
||||
buf_size,
|
||||
max_vectored_read_bytes,
|
||||
offenders
|
||||
);
|
||||
}
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let buf = BytesMut::with_capacity(max_vectored_read_bytes);
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf).await;
|
||||
|
||||
match res {
|
||||
|
||||
@@ -23,12 +23,8 @@ use tracing::*;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use super::{
|
||||
@@ -74,8 +70,6 @@ pub struct InMemoryLayerInner {
|
||||
/// Each serialized Value is preceded by a 'u32' length field.
|
||||
/// PerSeg::page_versions map stores offsets into this file.
|
||||
file: EphemeralFile,
|
||||
|
||||
resource_units: GlobalResourceUnits,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for InMemoryLayerInner {
|
||||
@@ -84,121 +78,6 @@ impl std::fmt::Debug for InMemoryLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
/// to minimize contention.
///
/// This global state is used to implement behaviors that require a global view of the system, e.g.
/// rolling layers proactively to limit the total amount of dirty data.
pub(crate) struct GlobalResources {
    // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
    // Zero means unlimited.
    pub(crate) max_dirty_bytes: AtomicU64,
    // How many bytes are in all EphemeralFile objects
    dirty_bytes: AtomicU64,
    // How many layers are contributing to dirty_bytes
    dirty_layers: AtomicUsize,
}

// Per-timeline RAII struct for its contribution to [`GlobalResources`]
struct GlobalResourceUnits {
    // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
    // for decrementing the global counter by this many bytes when dropped.
    dirty_bytes: u64,
}

impl GlobalResourceUnits {
    // Hint for the layer append path to update us when the layer size differs from the last
    // call to update_size by this much. If we don't reach this threshold, we'll still get
    // updated when the Timeline "ticks" in the background.
    const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;

    fn new() -> Self {
        GLOBAL_RESOURCES
            .dirty_layers
            .fetch_add(1, AtomicOrdering::Relaxed);
        Self { dirty_bytes: 0 }
    }
|
||||
|
||||
/// Do not call this frequently: all timelines will write to these same global atomics,
|
||||
/// so this is a relatively expensive operation. Wait at least a few seconds between calls.
|
||||
///
|
||||
/// Returns the effective layer size limit that should be applied, if any, to keep
|
||||
/// the total number of dirty bytes below the configured maximum.
|
||||
fn publish_size(&mut self, size: u64) -> Option<u64> {
|
||||
let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
|
||||
Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
|
||||
Ordering::Greater => {
|
||||
let delta = size - self.dirty_bytes;
|
||||
let old = GLOBAL_RESOURCES
|
||||
.dirty_bytes
|
||||
.fetch_add(delta, AtomicOrdering::Relaxed);
|
||||
old + delta
|
||||
}
|
||||
Ordering::Less => {
|
||||
let delta = self.dirty_bytes - size;
|
||||
let old = GLOBAL_RESOURCES
|
||||
.dirty_bytes
|
||||
.fetch_sub(delta, AtomicOrdering::Relaxed);
|
||||
old - delta
|
||||
}
|
||||
};
|
||||
|
||||
// This is a sloppy update: concurrent updates to the counter will race, and the exact
|
||||
// value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
|
||||
// That's okay: as long as the metric contains some recent value, it doesn't have to always
|
||||
// be literally the last update.
|
||||
TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
|
||||
|
||||
self.dirty_bytes = size;
|
||||
|
||||
let max_dirty_bytes = GLOBAL_RESOURCES
|
||||
.max_dirty_bytes
|
||||
.load(AtomicOrdering::Relaxed);
|
||||
if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
|
||||
// Set the layer file limit to the average layer size: this implies that all above-average
|
||||
// sized layers will be eligible for freezing. They will be frozen in the order they
|
||||
// next enter publish_size.
|
||||
Some(
|
||||
new_global_dirty_bytes
|
||||
/ GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// Call publish_size if the input size differs from last published size by more than
|
||||
// the drift limit
|
||||
fn maybe_publish_size(&mut self, size: u64) {
|
||||
let publish = match size.cmp(&self.dirty_bytes) {
|
||||
Ordering::Equal => false,
|
||||
Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
|
||||
Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
|
||||
};
|
||||
|
||||
if publish {
|
||||
self.publish_size(size);
|
||||
}
|
||||
}
|
||||
}
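// Illustrative sketch (not part of the diff): the drift rule used by
// `maybe_publish_size` above, in isolation. With MAX_SIZE_DRIFT at 10 MiB,
// small per-layer fluctuations are absorbed locally and the shared atomics are
// only touched when the delta is large enough. The helper name `should_publish`
// is hypothetical.
fn should_publish(last_published: u64, current: u64, max_drift: u64) -> bool {
    current.abs_diff(last_published) > max_drift
}
// e.g. should_publish(0, 4 << 20, 10 << 20) == false,
//      should_publish(0, 16 << 20, 10 << 20) == true.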
|
||||
|
||||
impl Drop for GlobalResourceUnits {
|
||||
fn drop(&mut self) {
|
||||
GLOBAL_RESOURCES
|
||||
.dirty_layers
|
||||
.fetch_sub(1, AtomicOrdering::Relaxed);
|
||||
|
||||
// Subtract our contribution to the global total dirty bytes
|
||||
self.publish_size(0);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
|
||||
max_dirty_bytes: AtomicU64::new(0),
|
||||
dirty_bytes: AtomicU64::new(0),
|
||||
dirty_layers: AtomicUsize::new(0),
|
||||
};
|
||||
|
||||
impl InMemoryLayer {
|
||||
pub(crate) fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
@@ -214,10 +93,6 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn try_len(&self) -> Option<u64> {
|
||||
self.inner.try_read().map(|i| i.file.len()).ok()
|
||||
}
|
||||
|
||||
pub(crate) fn assert_writable(&self) {
|
||||
assert!(self.end_lsn.get().is_none());
|
||||
}
|
||||
@@ -453,7 +328,6 @@ impl InMemoryLayer {
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
index: HashMap::new(),
|
||||
file,
|
||||
resource_units: GlobalResourceUnits::new(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -504,18 +378,9 @@ impl InMemoryLayer {
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
}
|
||||
|
||||
let size = locked_inner.file.len();
|
||||
locked_inner.resource_units.maybe_publish_size(size);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn tick(&self) -> Option<u64> {
|
||||
let mut inner = self.inner.write().await;
|
||||
let size = inner.file.len();
|
||||
inner.resource_units.publish_size(size)
|
||||
}
|
||||
|
||||
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
|
||||
// TODO: Currently, we just leak the storage for any deleted keys
|
||||
Ok(())
|
||||
|
||||
File diff suppressed because it is too large
@@ -1,119 +0,0 @@
|
||||
//! failpoints for unit tests, implying `#[cfg(test)]`.
|
||||
//!
|
||||
//! These are not accessible over http.
|
||||
|
||||
use super::*;
|
||||
|
||||
impl Layer {
|
||||
/// Enable a failpoint from a unit test.
|
||||
pub(super) fn enable_failpoint(&self, failpoint: Failpoint) {
|
||||
self.0.failpoints.lock().unwrap().push(failpoint);
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerInner {
|
||||
/// Query if this failpoint is enabled, as in, arrive at a failpoint.
|
||||
///
|
||||
/// Calls to this method need to be `#[cfg(test)]` guarded.
|
||||
pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> {
|
||||
let fut = {
|
||||
let mut fps = self.failpoints.lock().unwrap();
|
||||
// find the *last* failpoint for cases in which we need to use multiple for the same
|
||||
// thing (two blocked evictions)
|
||||
let fp = fps.iter_mut().rfind(|x| x.kind() == kind);
|
||||
|
||||
let Some(fp) = fp else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
fp.hit()
|
||||
};
|
||||
|
||||
fut.await
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub(crate) enum FailpointKind {
|
||||
/// Failpoint acts as an accurate cancelled by drop here; see the only site of use.
|
||||
AfterDeterminingLayerNeedsNoDownload,
|
||||
/// Failpoint for stalling eviction starting
|
||||
WaitBeforeStartingEvicting,
|
||||
/// Failpoint hit in the spawned task
|
||||
WaitBeforeDownloading,
|
||||
}
|
||||
|
||||
pub(crate) enum Failpoint {
|
||||
AfterDeterminingLayerNeedsNoDownload,
|
||||
WaitBeforeStartingEvicting(
|
||||
Option<utils::completion::Completion>,
|
||||
utils::completion::Barrier,
|
||||
),
|
||||
WaitBeforeDownloading(
|
||||
Option<utils::completion::Completion>,
|
||||
utils::completion::Barrier,
|
||||
),
|
||||
}
|
||||
|
||||
impl Failpoint {
|
||||
fn kind(&self) -> FailpointKind {
|
||||
match self {
|
||||
Failpoint::AfterDeterminingLayerNeedsNoDownload => {
|
||||
FailpointKind::AfterDeterminingLayerNeedsNoDownload
|
||||
}
|
||||
Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting,
|
||||
Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading,
|
||||
}
|
||||
}
|
||||
|
||||
fn hit(&mut self) -> impl std::future::Future<Output = Result<(), FailpointHit>> + 'static {
|
||||
use futures::future::FutureExt;
|
||||
|
||||
// use boxed futures to avoid Either hurdles
|
||||
match self {
|
||||
Failpoint::AfterDeterminingLayerNeedsNoDownload => {
|
||||
let kind = self.kind();
|
||||
|
||||
async move { Err(FailpointHit(kind)) }.boxed()
|
||||
}
|
||||
Failpoint::WaitBeforeStartingEvicting(arrival, b)
|
||||
| Failpoint::WaitBeforeDownloading(arrival, b) => {
|
||||
// first one signals arrival
|
||||
drop(arrival.take());
|
||||
|
||||
let b = b.clone();
|
||||
|
||||
async move {
|
||||
tracing::trace!("waiting on a failpoint barrier");
|
||||
b.wait().await;
|
||||
tracing::trace!("done waiting on a failpoint barrier");
|
||||
Ok(())
|
||||
}
|
||||
.boxed()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for FailpointKind {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct FailpointHit(FailpointKind);
|
||||
|
||||
impl std::fmt::Display for FailpointHit {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for FailpointHit {}
|
||||
|
||||
impl From<FailpointHit> for DownloadError {
|
||||
fn from(value: FailpointHit) -> Self {
|
||||
DownloadError::Failpoint(value.0)
|
||||
}
|
||||
}
|
||||
@@ -1,13 +1,14 @@
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::key::CONTROLFILE_KEY;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::Instrument;
|
||||
use utils::{
|
||||
completion::{self, Completion},
|
||||
id::TimelineId,
|
||||
};
|
||||
|
||||
use super::failpoints::{Failpoint, FailpointKind};
|
||||
use super::*;
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
|
||||
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
|
||||
|
||||
/// Used in tests to advance a future to a wanted await point, and no further.
|
||||
@@ -20,7 +21,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
|
||||
/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
|
||||
#[tokio::test]
|
||||
async fn smoke_test() {
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let handle = BACKGROUND_RUNTIME.handle();
|
||||
|
||||
let h = TenantHarness::create("smoke_test").unwrap();
|
||||
let span = h.span();
|
||||
@@ -37,7 +38,7 @@ async fn smoke_test() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.resident_layers().collect::<Vec<_>>().await
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -87,7 +88,7 @@ async fn smoke_test() {
|
||||
//
|
||||
// ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
|
||||
// artificially slow it down.
|
||||
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await;
|
||||
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
|
||||
|
||||
match layer
|
||||
.evict_and_wait(std::time::Duration::ZERO)
|
||||
@@ -98,7 +99,7 @@ async fn smoke_test() {
|
||||
// expected, but note that the eviction is "still ongoing"
|
||||
helper.release().await;
|
||||
// exhaust spawn_blocking pool to ensure it is now complete
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle)
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
|
||||
.await;
|
||||
}
|
||||
other => unreachable!("{other:?}"),
|
||||
@@ -107,7 +108,7 @@ async fn smoke_test() {
|
||||
// only way to query if a layer is resident is to acquire a ResidentLayer instance.
|
||||
// Layer::keep_resident never downloads, but it might initialize if the layer file is found
|
||||
// downloaded locally.
|
||||
let none = layer.keep_resident().await;
|
||||
let none = layer.keep_resident().await.unwrap();
|
||||
assert!(
|
||||
none.is_none(),
|
||||
"Expected none, because eviction removed the local file, found: {none:?}"
|
||||
@@ -166,7 +167,6 @@ async fn smoke_test() {
|
||||
rtc.wait_completion().await.unwrap();
|
||||
|
||||
assert_eq!(rtc.get_remote_physical_size(), 0);
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
}
|
||||
|
||||
/// This test demonstrates a previous hang when a eviction and deletion were requested at the same
|
||||
@@ -174,7 +174,7 @@ async fn smoke_test() {
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn evict_and_wait_on_wanted_deleted() {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let handle = BACKGROUND_RUNTIME.handle();
|
||||
|
||||
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
|
||||
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
@@ -188,7 +188,7 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.resident_layers().collect::<Vec<_>>().await
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -213,11 +213,11 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
drop(resident);
|
||||
|
||||
// make sure the eviction task gets to run
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
|
||||
|
||||
let resident = layer.keep_resident().await;
|
||||
assert!(
|
||||
resident.is_none(),
|
||||
matches!(resident, Ok(None)),
|
||||
"keep_resident should not have re-initialized: {resident:?}"
|
||||
);
|
||||
|
||||
@@ -235,332 +235,24 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
layers.finish_gc_timeline(&[layer]);
|
||||
}
|
||||
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
|
||||
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
}
|
||||
|
||||
/// This test ensures we are able to read the layer while the layer eviction has been
|
||||
/// started but not completed.
|
||||
#[test]
|
||||
fn read_wins_pending_eviction() {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.max_blocking_threads(1)
|
||||
.enable_all()
|
||||
.start_paused(true)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
rt.block_on(async move {
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
layers.swap_remove(0)
|
||||
};
|
||||
|
||||
// setup done
|
||||
|
||||
let resident = layer.keep_resident().await.unwrap();
|
||||
|
||||
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
|
||||
// drive the future to await on the status channel
|
||||
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect_err("should had been a timeout since we are holding the layer resident");
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
let (completion, barrier) = utils::completion::channel();
|
||||
let (arrival, arrived_at_barrier) = utils::completion::channel();
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
|
||||
Some(arrival),
|
||||
barrier,
|
||||
));
|
||||
|
||||
// now the eviction cannot proceed because the threads are consumed while completion exists
|
||||
drop(resident);
|
||||
arrived_at_barrier.wait().await;
|
||||
assert!(!layer.is_likely_resident());
|
||||
|
||||
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
|
||||
layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.instrument(download_span)
|
||||
.await
|
||||
.expect("should had reinitialized without downloading");
|
||||
|
||||
assert!(layer.is_likely_resident());
|
||||
|
||||
// reinitialization notifies of new resident status, which should error out all evict_and_wait
|
||||
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect("no timeout, because get_or_maybe_download re-initialized")
|
||||
.expect_err("eviction should not have succeeded because re-initialized");
|
||||
|
||||
// works as intended: evictions lose to "downloads"
|
||||
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// this is not wrong: the eviction is technically still "on the way" as it's still queued
|
||||
// because of a failpoint
|
||||
assert_eq!(
|
||||
0,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
drop(completion);
|
||||
|
||||
tokio::time::sleep(ADVANCE).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
|
||||
.await;
|
||||
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// now we finally can observe the original eviction failing
|
||||
// it would have been possible to observe it earlier, but here it is guaranteed to have
|
||||
// happened.
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
|
||||
);
|
||||
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
});
|
||||
}
|
||||
|
||||
/// Use failpoint to delay an eviction starting to get a VersionCheckFailed.
|
||||
#[test]
|
||||
fn multiple_pending_evictions_in_order() {
|
||||
let name = "multiple_pending_evictions_in_order";
|
||||
let in_order = true;
|
||||
multiple_pending_evictions_scenario(name, in_order);
|
||||
}
|
||||
|
||||
/// Use a failpoint to reorder the later eviction before the first in order to get an UnexpectedEvictedState.
|
||||
#[test]
|
||||
fn multiple_pending_evictions_out_of_order() {
|
||||
let name = "multiple_pending_evictions_out_of_order";
|
||||
let in_order = false;
|
||||
multiple_pending_evictions_scenario(name, in_order);
|
||||
}
|
||||
|
||||
fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.max_blocking_threads(1)
|
||||
.enable_all()
|
||||
.start_paused(true)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
rt.block_on(async move {
|
||||
// this is the runtime on which Layer spawns its blocking tasks
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create(name).unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
layers.swap_remove(0)
|
||||
};
|
||||
|
||||
// setup done
|
||||
|
||||
let resident = layer.keep_resident().await.unwrap();
|
||||
|
||||
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
|
||||
// drive the future to await on the status channel
|
||||
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect_err("should had been a timeout since we are holding the layer resident");
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
let (completion1, barrier) = utils::completion::channel();
|
||||
let mut completion1 = Some(completion1);
|
||||
let (arrival, arrived_at_barrier) = utils::completion::channel();
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
|
||||
Some(arrival),
|
||||
barrier,
|
||||
));
|
||||
|
||||
// now the eviction cannot proceed because we are simulating an arbitrarily long delay for the
|
||||
// eviction task start.
|
||||
drop(resident);
|
||||
assert!(!layer.is_likely_resident());
|
||||
|
||||
arrived_at_barrier.wait().await;
|
||||
|
||||
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
|
||||
layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.instrument(download_span)
|
||||
.await
|
||||
.expect("should had reinitialized without downloading");
|
||||
|
||||
assert!(layer.is_likely_resident());
|
||||
|
||||
// reinitialization notifies of new resident status, which should error out all evict_and_wait
|
||||
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect("no timeout, because get_or_maybe_download re-initialized")
|
||||
.expect_err("eviction should not have succeeded because re-initialized");
|
||||
|
||||
// works as intended: evictions lose to "downloads"
|
||||
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// this is not wrong: the eviction is technically still "on the way" as it's still queued
|
||||
// because of a failpoint
|
||||
assert_eq!(
|
||||
0,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// configure another failpoint for the second eviction -- evictions are per initialization,
|
||||
// so now that we've reinitialized the inner, we get to run two of them at the same time.
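// (each eviction is tied to the initialization it observed when it started; the earlier,
// pre-reinitialization eviction is therefore expected to lose the version check later on.)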
|
||||
let (completion2, barrier) = utils::completion::channel();
|
||||
let (arrival, arrived_at_barrier) = utils::completion::channel();
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
|
||||
Some(arrival),
|
||||
barrier,
|
||||
));
|
||||
|
||||
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
|
||||
// advance to the wait on the queue
|
||||
tokio::time::timeout(ADVANCE, &mut second_eviction)
|
||||
.await
|
||||
.expect_err("timeout because failpoint is blocking");
|
||||
|
||||
arrived_at_barrier.wait().await;
|
||||
|
||||
assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
let mut release_earlier_eviction = |expected_reason| {
|
||||
assert_eq!(
|
||||
0,
|
||||
LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
|
||||
);
|
||||
|
||||
drop(completion1.take().unwrap());
|
||||
|
||||
let handle = &handle;
|
||||
|
||||
async move {
|
||||
tokio::time::sleep(ADVANCE).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(
|
||||
handle, 1,
|
||||
)
|
||||
.await;
|
||||
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
if in_order {
|
||||
release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await;
|
||||
}
|
||||
|
||||
// release the later eviction which is for the current version
|
||||
drop(completion2);
|
||||
tokio::time::sleep(ADVANCE).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
|
||||
.await;
|
||||
|
||||
if !in_order {
|
||||
release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await;
|
||||
}
|
||||
|
||||
tokio::time::timeout(ADVANCE, &mut second_eviction)
|
||||
.await
|
||||
.expect("eviction goes through now that spawn_blocking is unclogged")
|
||||
.expect("eviction should succeed, because version matches");
|
||||
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// ensure the cancelled are unchanged
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
});
|
||||
}
|
||||
|
||||
/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently
/// a `Layer::keep_resident` call.
/// This test ensures that we are able to read the layer while the layer eviction has been
/// started but not completed due to the spawn_blocking pool being blocked.
///
/// This matters because cancelling the eviction would leave us in a state where the file is on
/// disk but the layer's internal state says it has not been initialized. Furthermore, it allows us
/// to have a non-repairing `Layer::is_likely_resident`.
/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download.
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
let h =
|
||||
TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
|
||||
async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
|
||||
// this is the runtime on which Layer spawns its blocking tasks
|
||||
let handle = BACKGROUND_RUNTIME.handle();
|
||||
let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
|
||||
.unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
@@ -571,7 +263,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
layers.resident_layers().collect::<Vec<_>>().await
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
@@ -579,154 +271,90 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
layers.swap_remove(0)
|
||||
};
|
||||
|
||||
// this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an
|
||||
// Err) at the right time as in "during" the `LayerInner::needs_download`.
|
||||
layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload);
|
||||
// setup done
|
||||
|
||||
let (completion, barrier) = utils::completion::channel();
|
||||
let (arrival, arrived_at_barrier) = utils::completion::channel();
|
||||
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
|
||||
Some(arrival),
|
||||
barrier,
|
||||
));
|
||||
|
||||
tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER))
|
||||
.await
|
||||
.expect_err("should had advanced to waiting on channel");
|
||||
|
||||
arrived_at_barrier.wait().await;
|
||||
|
||||
// simulate a cancelled read which is cancelled before it gets to re-initialize
|
||||
let e = layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(
|
||||
matches!(
|
||||
e,
|
||||
DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload)
|
||||
),
|
||||
"{e:?}"
|
||||
);
|
||||
|
||||
assert!(
|
||||
layer.0.needs_download().await.unwrap().is_none(),
|
||||
"file is still on disk"
|
||||
);
|
||||
|
||||
// release the eviction task
|
||||
drop(completion);
|
||||
tokio::time::sleep(ADVANCE).await;
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
|
||||
|
||||
// failpoint is still enabled, but it is not hit
|
||||
let e = layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
|
||||
|
||||
// failpoint is not counted as cancellation either
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn evict_and_wait_does_not_wait_for_download() {
|
||||
// let handle = tokio::runtime::Handle::current();
|
||||
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.likely_resident_layers().collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
layers.swap_remove(0)
|
||||
};
|
||||
|
||||
// kind of forced setup: start an eviction but do not allow it to progress until we are
|
||||
// downloading
|
||||
let (eviction_can_continue, barrier) = utils::completion::channel();
|
||||
let (arrival, eviction_arrived) = utils::completion::channel();
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting(
|
||||
Some(arrival),
|
||||
barrier,
|
||||
));
|
||||
let resident = layer.keep_resident().await.unwrap();
|
||||
|
||||
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
|
||||
// use this once-awaited other_evict to synchronize with the eviction
|
||||
let other_evict = layer.evict_and_wait(FOREVER);
|
||||
|
||||
// drive the future to await on the status channel
|
||||
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect_err("should had advanced");
|
||||
eviction_arrived.wait().await;
|
||||
drop(eviction_can_continue);
|
||||
other_evict.await.unwrap();
|
||||
.expect_err("should had been a timeout since we are holding the layer resident");
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
// now the layer is evicted, and the "evict_and_wait" is waiting on the receiver
|
||||
assert!(!layer.is_likely_resident());
|
||||
// clog up BACKGROUND_RUNTIME spawn_blocking
|
||||
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
|
||||
|
||||
// following new evict_and_wait will fail until we've completed the download
|
||||
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
|
||||
assert!(matches!(e, EvictionError::NotFound), "{e:?}");
|
||||
// now the eviction cannot proceed because the threads are consumed while completion exists
|
||||
drop(resident);
|
||||
|
||||
let (download_can_continue, barrier) = utils::completion::channel();
|
||||
let (arrival, _download_arrived) = utils::completion::channel();
|
||||
layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier));
|
||||
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
|
||||
layer
|
||||
.keep_resident()
|
||||
.await
|
||||
.expect("keep_resident should had reinitialized without downloading")
|
||||
.expect("ResidentLayer");
|
||||
|
||||
let mut download = std::pin::pin!(layer
|
||||
.0
|
||||
.get_or_maybe_download(true, None)
|
||||
.instrument(download_span));
|
||||
// because the keep_resident check alters the wanted-evicted state without sending a message, we will
|
||||
// never get completed
|
||||
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
.await
|
||||
.expect("no timeout, because keep_resident re-initialized")
|
||||
.expect_err("eviction should not have succeeded because re-initialized");
|
||||
|
||||
assert!(
|
||||
!layer.is_likely_resident(),
|
||||
"during download layer is evicted"
|
||||
// works as intended: evictions lose to "downloads"
|
||||
assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
|
||||
assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
// this is not wrong: the eviction is technically still "on the way" as it's still queued
|
||||
// because spawn_blocking is clogged up
|
||||
assert_eq!(
|
||||
0,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
tokio::time::timeout(ADVANCE, &mut download)
|
||||
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
|
||||
// advance to the wait on the queue
|
||||
tokio::time::timeout(ADVANCE, &mut second_eviction)
|
||||
.await
|
||||
.expect_err("should had timed out because of failpoint");
|
||||
.expect_err("timeout because spawn_blocking is clogged");
|
||||
|
||||
// now we finally get to continue, and because the latest state is downloading, we deduce that
|
||||
// original eviction succeeded
|
||||
evict_and_wait.await.unwrap();
|
||||
// in this case we don't leak started evictions, but I think there is still a chance of that
|
||||
// happening, because we could have upgrades race multiple evictions while only one of them
|
||||
// happens?
|
||||
assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
// however a new evict_and_wait will fail
|
||||
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
|
||||
assert!(matches!(e, EvictionError::NotFound), "{e:?}");
|
||||
helper.release().await;
|
||||
|
||||
assert!(!layer.is_likely_resident());
|
||||
// the second_eviction gets to run here
|
||||
//
|
||||
// synchronize to be *strictly* after the second_eviction spawn_blocking run
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
|
||||
|
||||
drop(download_can_continue);
|
||||
download.await.expect("download should had succeeded");
|
||||
assert!(layer.is_likely_resident());
|
||||
tokio::time::timeout(ADVANCE, &mut second_eviction)
|
||||
.await
|
||||
.expect("eviction goes through now that spawn_blocking is unclogged")
|
||||
.expect("eviction should succeed, because version matches");
|
||||
|
||||
// only now can we evict
|
||||
layer.evict_and_wait(FOREVER).await.unwrap();
|
||||
}
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
|
||||
|
||||
#[test]
|
||||
fn layer_size() {
|
||||
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
|
||||
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
|
||||
assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
|
||||
// it also has the utf8 path
|
||||
// now we finally can observe the original spawn_blocking failing
|
||||
// it would have been possible to observe it earlier, but here it is guaranteed to have
|
||||
// happened.
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS
|
||||
.cancelled_evictions
|
||||
.values()
|
||||
.map(|ctr| ctr.get())
|
||||
.sum::<u64>()
|
||||
);
|
||||
}
|
||||
|
||||
struct SpawnBlockingPoolHelper {
|
||||
@@ -743,41 +371,31 @@ impl SpawnBlockingPoolHelper {
|
||||
///
|
||||
/// This should be no issue nowadays, because nextest runs each test in its own process.
|
||||
async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self {
|
||||
let default_max_blocking_threads = 512;
|
||||
|
||||
Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await
|
||||
}
|
||||
|
||||
async fn consume_all_spawn_blocking_threads0(
|
||||
handle: &tokio::runtime::Handle,
|
||||
threads: usize,
|
||||
) -> Self {
|
||||
assert_ne!(threads, 0);
|
||||
|
||||
let (completion, barrier) = completion::channel();
|
||||
let (started, starts_completed) = completion::channel();
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel(8);
|
||||
|
||||
let assumed_max_blocking_threads = 512;
|
||||
|
||||
let mut blocking_tasks = JoinSet::new();
|
||||
|
||||
for _ in 0..threads {
|
||||
for _ in 0..assumed_max_blocking_threads {
|
||||
let barrier = barrier.clone();
|
||||
let started = started.clone();
|
||||
let tx = tx.clone();
|
||||
blocking_tasks.spawn_blocking_on(
|
||||
move || {
|
||||
drop(started);
|
||||
tx.blocking_send(()).unwrap();
|
||||
drop(tx);
|
||||
tokio::runtime::Handle::current().block_on(barrier.wait());
|
||||
},
|
||||
handle,
|
||||
);
|
||||
}
|
||||
|
||||
drop(started);
|
||||
|
||||
starts_completed.wait().await;
|
||||
|
||||
drop(barrier);
|
||||
|
||||
tracing::trace!("consumed all threads");
|
||||
for _ in 0..assumed_max_blocking_threads {
|
||||
rx.recv().await.unwrap();
|
||||
}
|
||||
|
||||
SpawnBlockingPoolHelper {
|
||||
awaited_by_spawn_blocking_tasks: completion,
|
||||
@@ -797,22 +415,13 @@ impl SpawnBlockingPoolHelper {
|
||||
while let Some(res) = blocking_tasks.join_next().await {
|
||||
res.expect("none of the tasks should had panicked");
|
||||
}
|
||||
|
||||
tracing::trace!("released all threads");
|
||||
}
|
||||
|
||||
/// In the tests it is used as an easy way of making sure something scheduled on the target
|
||||
/// runtime's `spawn_blocking` has completed, because it must've been scheduled and completed
|
||||
/// before our tasks have a chance to schedule and complete.
|
||||
async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) {
|
||||
Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await
|
||||
}
|
||||
|
||||
async fn consume_and_release_all_of_spawn_blocking_threads0(
|
||||
handle: &tokio::runtime::Handle,
|
||||
threads: usize,
|
||||
) {
|
||||
Self::consume_all_spawn_blocking_threads0(handle, threads)
|
||||
Self::consume_all_spawn_blocking_threads(handle)
|
||||
.await
|
||||
.release()
|
||||
.await
|
||||
@@ -826,7 +435,7 @@ fn spawn_blocking_pool_helper_actually_works() {
|
||||
// because the amount is not configurable for our helper, expect the same amount as
|
||||
// BACKGROUND_RUNTIME using the tokio defaults would have.
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.max_blocking_threads(1)
|
||||
.max_blocking_threads(512)
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
@@ -836,8 +445,7 @@ fn spawn_blocking_pool_helper_actually_works() {
|
||||
rt.block_on(async move {
|
||||
// this will not return until all threads are spun up and actually executing the code
|
||||
// waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
|
||||
let consumed =
|
||||
SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await;
|
||||
let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
|
||||
|
||||
println!("consumed");
|
||||
|
||||
|
||||
@@ -13,13 +13,14 @@ use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use futures::stream::StreamExt;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::AUX_FILES_KEY,
|
||||
keyspace::KeySpaceAccum,
|
||||
models::{
|
||||
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
|
||||
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
|
||||
EvictionPolicy, LayerMapInfo, TimelineState,
|
||||
},
|
||||
reltag::BlockNumber,
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
@@ -36,7 +37,6 @@ use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
vec_map::VecMap,
|
||||
};
|
||||
|
||||
use std::ops::{Deref, Range};
|
||||
@@ -54,7 +54,6 @@ use std::{
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::tenant::{
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
@@ -65,6 +64,7 @@ use crate::{
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
};
|
||||
use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
|
||||
use crate::{
|
||||
disk_usage_eviction_task::finite_f32,
|
||||
tenant::storage_layer::{
|
||||
@@ -309,8 +309,6 @@ pub struct Timeline {
|
||||
/// Configuration: how often should the partitioning be recalculated.
|
||||
repartition_threshold: u64,
|
||||
|
||||
last_image_layer_creation_check_at: AtomicLsn,
|
||||
|
||||
/// Current logical size of the "datadir", at the last LSN.
|
||||
current_logical_size: LogicalSize,
|
||||
|
||||
@@ -1144,79 +1142,6 @@ impl Timeline {
|
||||
self.flush_frozen_layers_and_wait().await
|
||||
}
|
||||
|
||||
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
|
||||
///
|
||||
/// This is for use in background housekeeping, to provide guarantees of layers closing eventually
|
||||
/// even if there are no ongoing writes to drive that.
|
||||
async fn maybe_freeze_ephemeral_layer(&self) {
|
||||
let Ok(_write_guard) = self.write_lock.try_lock() else {
|
||||
// If the write lock is held, there is an active wal receiver: rolling open layers
|
||||
// is their responsibility while they hold this lock.
|
||||
return;
|
||||
};
|
||||
|
||||
let Ok(layers_guard) = self.layers.try_read() else {
|
||||
// Don't block if the layer lock is busy
|
||||
return;
|
||||
};
|
||||
|
||||
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
|
||||
// No open layer, no work to do.
|
||||
return;
|
||||
};
|
||||
|
||||
let Some(current_size) = open_layer.try_len() else {
|
||||
// Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
|
||||
// the read lock to get its size should always succeed.
|
||||
tracing::warn!("Lock conflict while reading size of open layer");
|
||||
return;
|
||||
};
|
||||
|
||||
let current_lsn = self.get_last_record_lsn();
|
||||
|
||||
let checkpoint_distance_override = open_layer.tick().await;
|
||||
|
||||
if let Some(size_override) = checkpoint_distance_override {
|
||||
if current_size > size_override {
|
||||
// This is not harmful, but it only happens in relatively rare cases where
|
||||
// time-based checkpoints are not happening fast enough to keep the amount of
|
||||
// ephemeral data within configured limits. It's a sign of stress on the system.
|
||||
tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure");
|
||||
}
|
||||
}
|
||||
|
||||
let checkpoint_distance =
|
||||
checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance());
|
||||
|
||||
if self.should_roll(
|
||||
current_size,
|
||||
current_size,
|
||||
checkpoint_distance,
|
||||
self.get_last_record_lsn(),
|
||||
self.last_freeze_at.load(),
|
||||
*self.last_freeze_ts.read().unwrap(),
|
||||
) {
|
||||
match open_layer.info() {
|
||||
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
|
||||
// We may reach this point if the layer was already frozen but not yet flushed: flushing
|
||||
// happens asynchronously in the background.
|
||||
tracing::debug!(
|
||||
"Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
|
||||
);
|
||||
}
|
||||
InMemoryLayerInfo::Open { .. } => {
|
||||
// Upgrade to a write lock and freeze the layer
|
||||
drop(layers_guard);
|
||||
let mut layers_guard = self.layers.write().await;
|
||||
layers_guard
|
||||
.try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
self.flush_frozen_layers();
|
||||
}
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
pub(crate) async fn compact(
|
||||
self: &Arc<Self>,
|
||||
@@ -1239,11 +1164,6 @@ impl Timeline {
|
||||
(guard, permit)
|
||||
};
|
||||
|
||||
// Prior to compaction, check if an open ephemeral layer should be closed: this provides
|
||||
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
|
||||
// an ephemeral layer open forever when idle.
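// (compaction itself runs on a periodic schedule, so doing the check here bounds how long an idle
// ephemeral layer can stay open to roughly one compaction period, assuming the try-locks inside
// the check are uncontended.)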
|
||||
self.maybe_freeze_ephemeral_layer().await;
|
||||
|
||||
// this wait probably never needs any "long time spent" logging, because we already nag if
|
||||
// the compaction task goes over its period (20s), which is quite often in production.
|
||||
let (_guard, _permit) = tokio::select! {
|
||||
@@ -1276,7 +1196,6 @@ impl Timeline {
|
||||
|
||||
pub(crate) fn activate(
|
||||
self: &Arc<Self>,
|
||||
parent: Arc<crate::tenant::Tenant>,
|
||||
broker_client: BrokerClientChannel,
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
@@ -1287,7 +1206,7 @@ impl Timeline {
|
||||
}
|
||||
self.launch_wal_receiver(ctx, broker_client);
|
||||
self.set_state(TimelineState::Active);
|
||||
self.launch_eviction_task(parent, background_jobs_can_start);
|
||||
self.launch_eviction_task(background_jobs_can_start);
|
||||
}
|
||||
|
||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
||||
@@ -1322,7 +1241,11 @@ impl Timeline {
|
||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||
// obviously it does not make sense to stop while we wait for it, but what
|
||||
// about corner cases like s3 suddenly hanging up?
|
||||
client.shutdown().await;
|
||||
if let Err(e) = client.shutdown().await {
|
||||
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
||||
// we have some extra WAL replay to do next time the timeline starts.
|
||||
warn!("failed to flush to remote storage: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -1359,7 +1282,12 @@ impl Timeline {
|
||||
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
|
||||
// case our caller wants to use that for a deletion
|
||||
if let Some(remote_client) = self.remote_client.as_ref() {
|
||||
remote_client.stop();
|
||||
match remote_client.stop() {
|
||||
Ok(()) => {}
|
||||
Err(StopError::QueueUninitialized) => {
|
||||
// Shutting down during initialization is legal
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!("Waiting for tasks...");
|
||||
@@ -1515,53 +1443,6 @@ impl Timeline {
|
||||
Err(EvictionError::Timeout) => Ok(Some(false)),
|
||||
}
|
||||
}
|
||||
|
||||
fn should_roll(
|
||||
&self,
|
||||
layer_size: u64,
|
||||
projected_layer_size: u64,
|
||||
checkpoint_distance: u64,
|
||||
projected_lsn: Lsn,
|
||||
last_freeze_at: Lsn,
|
||||
last_freeze_ts: Instant,
|
||||
) -> bool {
|
||||
let distance = projected_lsn.widening_sub(last_freeze_at);
|
||||
|
||||
// Rolling the open layer can be triggered by:
|
||||
// 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
|
||||
// the safekeepers need to store. For sharded tenants, we multiply by shard count to
|
||||
// account for how writes are distributed across shards: we expect each node to consume
|
||||
// 1/count of the LSN on average.
|
||||
// 2. The size of the currently open layer.
|
||||
// 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
|
||||
// up and suspend activity.
|
||||
if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to LSN distance ({})",
|
||||
projected_lsn, layer_size, distance
|
||||
);
|
||||
|
||||
true
|
||||
} else if projected_layer_size >= checkpoint_distance {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to layer size ({})",
|
||||
projected_lsn, layer_size, projected_layer_size
|
||||
);
|
||||
|
||||
true
|
||||
} else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
|
||||
projected_lsn,
|
||||
layer_size,
|
||||
last_freeze_ts.elapsed()
|
||||
);
|
||||
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of times we will compute partition within a checkpoint distance.
|
||||
@@ -1634,15 +1515,6 @@ impl Timeline {
|
||||
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
|
||||
}
|
||||
|
||||
fn get_image_layer_creation_check_threshold(&self) -> u8 {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf.image_layer_creation_check_threshold.unwrap_or(
|
||||
self.conf
|
||||
.default_tenant_conf
|
||||
.image_layer_creation_check_threshold,
|
||||
)
|
||||
}
|
||||
|
||||
pub(super) fn tenant_conf_updated(&self) {
|
||||
// NB: Most tenant conf options are read by background loops, so,
|
||||
// changes will automatically be picked up.
|
||||
@@ -1780,7 +1652,6 @@ impl Timeline {
|
||||
},
|
||||
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
|
||||
repartition_threshold: 0,
|
||||
last_image_layer_creation_check_at: AtomicLsn::new(0),
|
||||
|
||||
last_received_wal: Mutex::new(None),
|
||||
rel_size_cache: RwLock::new(HashMap::new()),
|
||||
@@ -1809,7 +1680,6 @@ impl Timeline {
|
||||
};
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
|
||||
result
|
||||
.metrics
|
||||
.last_record_gauge
|
||||
@@ -2572,7 +2442,7 @@ impl Timeline {
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
|
||||
let resident = guard.likely_resident_layers().map(|layer| {
|
||||
let resident = guard.resident_layers().map(|layer| {
|
||||
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||
|
||||
HeatMapLayer::new(
|
||||
@@ -2582,7 +2452,7 @@ impl Timeline {
|
||||
)
|
||||
});
|
||||
|
||||
let layers = resident.collect();
|
||||
let layers = resident.collect().await;
|
||||
|
||||
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||
}
|
||||
@@ -2726,10 +2596,6 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
|
||||
let open_layer = open_layer.clone();
|
||||
drop(guard);
|
||||
|
||||
result = match open_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
@@ -2747,7 +2613,10 @@ impl Timeline {
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
Box::new(move || open_layer.traversal_id()),
|
||||
Box::new({
|
||||
let open_layer = Arc::clone(open_layer);
|
||||
move || open_layer.traversal_id()
|
||||
}),
|
||||
));
|
||||
continue 'outer;
|
||||
}
|
||||
@@ -2757,10 +2626,6 @@ impl Timeline {
|
||||
if cont_lsn > start_lsn {
|
||||
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
|
||||
let frozen_layer = frozen_layer.clone();
|
||||
drop(guard);
|
||||
|
||||
result = match frozen_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
@@ -2778,7 +2643,10 @@ impl Timeline {
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
Box::new(move || frozen_layer.traversal_id()),
|
||||
Box::new({
|
||||
let frozen_layer = Arc::clone(frozen_layer);
|
||||
move || frozen_layer.traversal_id()
|
||||
}),
|
||||
));
|
||||
continue 'outer;
|
||||
}
|
||||
@@ -2786,8 +2654,6 @@ impl Timeline {
|
||||
|
||||
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
drop(guard);
|
||||
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, lsn_floor);
|
||||
@@ -3514,24 +3380,6 @@ impl Timeline {
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
||||
let last = self.last_image_layer_creation_check_at.load();
|
||||
if lsn != Lsn(0) {
|
||||
let distance = lsn
|
||||
.checked_sub(last)
|
||||
.expect("Attempt to compact with LSN going backwards");
|
||||
|
||||
let min_distance = self.get_image_layer_creation_check_threshold() as u64
|
||||
* self.get_checkpoint_distance();
|
||||
|
||||
// Skip the expensive delta layer counting below if we've not ingested
|
||||
// sufficient WAL since the last check.
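// (min_distance above is image_layer_creation_check_threshold * checkpoint_distance; with a
// threshold of 2 and a 256 MiB checkpoint distance (illustrative values), the delta layer
// counting below only runs after ~512 MiB of new WAL.)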
|
||||
if distance.0 < min_distance {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
self.last_image_layer_creation_check_at.store(lsn);
|
||||
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
@@ -4454,7 +4302,7 @@ impl Timeline {
|
||||
let mut max_layer_size: Option<u64> = None;
|
||||
|
||||
let resident_layers = guard
|
||||
.likely_resident_layers()
|
||||
.resident_layers()
|
||||
.map(|layer| {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||
@@ -4467,7 +4315,8 @@ impl Timeline {
|
||||
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
.collect()
|
||||
.await;
|
||||
|
||||
DiskUsageEvictionInfo {
|
||||
max_layer_size,
|
||||
@@ -4612,6 +4461,49 @@ impl<'a> TimelineWriter<'a> {
|
||||
res
|
||||
}
|
||||
|
||||
/// "Tick" the timeline writer: it will roll the open layer if required
|
||||
/// and do nothing else.
|
||||
pub(crate) async fn tick(&mut self) -> anyhow::Result<()> {
|
||||
self.open_layer_if_present().await?;
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let action = self.get_open_layer_action(last_record_lsn, 0);
|
||||
if action == OpenLayerAction::Roll {
|
||||
self.roll_layer(last_record_lsn).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Populate the timeline writer state only if an in-memory layer
|
||||
/// is already open.
|
||||
async fn open_layer_if_present(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.write_guard.is_none());
|
||||
|
||||
let open_layer = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
match layers.open_layer {
|
||||
Some(ref open_layer) => open_layer.clone(),
|
||||
None => {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let initial_size = open_layer.size().await?;
|
||||
let last_freeze_at = self.last_freeze_at.load();
|
||||
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
|
||||
self.write_guard.replace(TimelineWriterState::new(
|
||||
open_layer,
|
||||
initial_size,
|
||||
last_freeze_at,
|
||||
last_freeze_ts,
|
||||
));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_open_layer_action(
|
||||
&mut self,
|
||||
at: Lsn,
|
||||
@@ -4683,29 +4575,59 @@ impl<'a> TimelineWriter<'a> {
|
||||
return OpenLayerAction::None;
|
||||
}
|
||||
|
||||
if self.tl.should_roll(
|
||||
state.current_size,
|
||||
state.current_size + new_value_size,
|
||||
self.get_checkpoint_distance(),
|
||||
lsn,
|
||||
state.cached_last_freeze_at,
|
||||
state.cached_last_freeze_ts,
|
||||
) {
|
||||
let distance = lsn.widening_sub(state.cached_last_freeze_at);
|
||||
let proposed_open_layer_size = state.current_size + new_value_size;
|
||||
|
||||
// Rolling the open layer can be triggered by:
|
||||
// 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
|
||||
// the safekeepers need to store. For sharded tenants, we multiply by shard count to
|
||||
// account for how writes are distributed across shards: we expect each node to consume
|
||||
// 1/count of the LSN on average.
|
||||
// 2. The size of the currently open layer.
|
||||
// 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
|
||||
// up and suspend activity.
|
||||
if distance
|
||||
>= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
|
||||
{
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to LSN distance ({})",
|
||||
lsn, state.current_size, distance
|
||||
);
|
||||
|
||||
OpenLayerAction::Roll
|
||||
} else if proposed_open_layer_size >= self.get_checkpoint_distance() {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to layer size ({})",
|
||||
lsn, state.current_size, proposed_open_layer_size
|
||||
);
|
||||
|
||||
OpenLayerAction::Roll
|
||||
} else if distance > 0
|
||||
&& state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
|
||||
{
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
|
||||
lsn,
|
||||
state.current_size,
|
||||
state.cached_last_freeze_ts.elapsed()
|
||||
);
|
||||
|
||||
OpenLayerAction::Roll
|
||||
} else {
|
||||
OpenLayerAction::None
|
||||
}
|
||||
}
|
||||
|
||||
/// Put a batch of keys at the specified Lsns.
|
||||
/// Put a batch of keys at the specified Lsns.
|
||||
///
|
||||
/// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]).
|
||||
/// The batch should be sorted by Lsn such that it's safe
|
||||
/// to roll the open layer mid batch.
|
||||
pub(crate) async fn put_batch(
|
||||
&mut self,
|
||||
batch: VecMap<Lsn, (Key, Value)>,
|
||||
batch: Vec<(Key, Lsn, Value)>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
for (lsn, (key, val)) in batch {
|
||||
for (key, lsn, val) in batch {
|
||||
self.put(key, lsn, &val, ctx).await?
|
||||
}
|
||||
|
||||
@@ -4791,6 +4713,7 @@ mod tests {
|
||||
.keep_resident()
|
||||
.await
|
||||
.expect("no download => no downloading errors")
|
||||
.expect("should had been resident")
|
||||
.drop_eviction_guard();
|
||||
|
||||
let forever = std::time::Duration::from_secs(120);
|
||||
@@ -4801,7 +4724,7 @@ mod tests {
|
||||
let (first, second) = tokio::join!(first, second);
|
||||
|
||||
let res = layer.keep_resident().await;
|
||||
assert!(res.is_none(), "{res:?}");
|
||||
assert!(matches!(res, Ok(None)), "{res:?}");
|
||||
|
||||
match (first, second) {
|
||||
(Ok(()), Ok(())) => {
|
||||
|
||||
@@ -16,7 +16,9 @@ use crate::{
|
||||
tenant::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||
remote_timeline_client::{
|
||||
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
|
||||
},
|
||||
CreateTimelineCause, DeleteTimelineError, Tenant,
|
||||
},
|
||||
};
|
||||
@@ -48,7 +50,19 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||
|
||||
// Prevent new uploads from starting.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
remote_client.stop();
|
||||
let res = remote_client.stop();
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(e) => match e {
|
||||
remote_timeline_client::StopError::QueueUninitialized => {
|
||||
// This case shouldn't happen currently because the
|
||||
// load and attach code bails out if _any_ of the timelines fails to fetch its IndexPart.
|
||||
// That is, before we declare the Tenant as Active.
|
||||
// But we only allow calls to delete_timeline on Active tenants.
|
||||
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Stop & wait for the remaining timeline tasks, including upload tasks.
|
||||
|
||||
@@ -51,7 +51,6 @@ pub struct EvictionTaskTenantState {
|
||||
impl Timeline {
|
||||
pub(super) fn launch_eviction_task(
|
||||
self: &Arc<Self>,
|
||||
parent: Arc<Tenant>,
|
||||
background_tasks_can_start: Option<&completion::Barrier>,
|
||||
) {
|
||||
let self_clone = Arc::clone(self);
|
||||
@@ -73,14 +72,14 @@ impl Timeline {
|
||||
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
|
||||
};
|
||||
|
||||
self_clone.eviction_task(parent, cancel).await;
|
||||
self_clone.eviction_task(cancel).await;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
||||
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
|
||||
// acquire the gate guard only once within a useful span
|
||||
@@ -104,7 +103,7 @@ impl Timeline {
|
||||
loop {
|
||||
let policy = self.get_eviction_policy();
|
||||
let cf = self
|
||||
.eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx)
|
||||
.eviction_iteration(&policy, &cancel, &guard, &ctx)
|
||||
.await;
|
||||
|
||||
match cf {
|
||||
@@ -124,7 +123,6 @@ impl Timeline {
|
||||
#[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
|
||||
async fn eviction_iteration(
|
||||
self: &Arc<Self>,
|
||||
tenant: &Tenant,
|
||||
policy: &EvictionPolicy,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -139,7 +137,7 @@ impl Timeline {
|
||||
}
|
||||
EvictionPolicy::LayerAccessThreshold(p) => {
|
||||
match self
|
||||
.eviction_iteration_threshold(tenant, p, cancel, gate, ctx)
|
||||
.eviction_iteration_threshold(p, cancel, gate, ctx)
|
||||
.await
|
||||
{
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
@@ -148,11 +146,7 @@ impl Timeline {
|
||||
(p.period, p.threshold)
|
||||
}
|
||||
EvictionPolicy::OnlyImitiate(p) => {
|
||||
if self
|
||||
.imitiate_only(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
.is_break()
|
||||
{
|
||||
if self.imitiate_only(p, cancel, gate, ctx).await.is_break() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
(p.period, p.threshold)
|
||||
@@ -181,7 +175,6 @@ impl Timeline {
|
||||
|
||||
async fn eviction_iteration_threshold(
|
||||
self: &Arc<Self>,
|
||||
tenant: &Tenant,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -200,10 +193,7 @@ impl Timeline {
|
||||
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
|
||||
};
|
||||
|
||||
match self
|
||||
.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
{
|
||||
match self.imitate_layer_accesses(p, cancel, gate, ctx).await {
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
@@ -235,18 +225,24 @@ impl Timeline {
|
||||
{
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
for layer in layers.iter_historic_layers() {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
for hist_layer in layers.iter_historic_layers() {
|
||||
let hist_layer = guard.get_from_desc(&hist_layer);
|
||||
|
||||
// guard against eviction while we inspect it; it might be that eviction_task and
|
||||
// disk_usage_eviction_task both select the same layers to be evicted, and
|
||||
// seemingly free up double the space. both succeeding is of no consequence.
|
||||
let guard = match hist_layer.keep_resident().await {
|
||||
Ok(Some(l)) => l,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
// these should not happen, but we cannot make them statically impossible right
|
||||
// now.
|
||||
tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if !layer.is_likely_resident() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||
let last_activity_ts = hist_layer.access_stats().latest_activity_or_now();
|
||||
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
@@ -269,8 +265,9 @@ impl Timeline {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let layer = guard.drop_eviction_guard();
|
||||
if no_activity_for > p.threshold {
|
||||
// this could cause a lot of allocations in some cases
|
||||
js.spawn(async move {
|
||||
layer
|
||||
.evict_and_wait(std::time::Duration::from_secs(5))
|
||||
@@ -325,7 +322,6 @@ impl Timeline {
|
||||
/// disk usage based eviction task.
|
||||
async fn imitiate_only(
|
||||
self: &Arc<Self>,
|
||||
tenant: &Tenant,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -342,8 +338,7 @@ impl Timeline {
|
||||
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
|
||||
};
|
||||
|
||||
self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
self.imitate_layer_accesses(p, cancel, gate, ctx).await
|
||||
}
|
||||
|
||||
/// If we evict layers but keep cached values derived from those layers, then
|
||||
@@ -373,7 +368,6 @@ impl Timeline {
|
||||
#[instrument(skip_all)]
|
||||
async fn imitate_layer_accesses(
|
||||
&self,
|
||||
tenant: &Tenant,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -409,11 +403,17 @@ impl Timeline {
|
||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||
// The others wait until the calculation is done so that they take into account the
|
||||
// imitated accesses that the winner made.
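// ("drawing the short straw" is simply whichever timeline takes the eviction_task_tenant_state
// lock first while last_layer_access_imitation is stale; the lock itself makes the others wait.)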
|
||||
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
|
||||
Ok(t) => t,
|
||||
Err(_) => {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
};
|
||||
let mut state = tenant.eviction_task_tenant_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx)
|
||||
self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
|
||||
.await;
|
||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
|
||||
}
|
||||
@@ -487,7 +487,7 @@ impl Timeline {
|
||||
#[instrument(skip_all)]
|
||||
async fn imitate_synthetic_size_calculation_worker(
|
||||
&self,
|
||||
tenant: &Tenant,
|
||||
tenant: &Arc<Tenant>,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use tracing::trace;
|
||||
@@ -240,16 +241,29 @@ impl LayerManager {
|
||||
layer.delete_on_drop();
|
||||
}
|
||||
|
||||
pub(crate) fn likely_resident_layers(&self) -> impl Iterator<Item = Layer> + '_ {
|
||||
pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream<Item = Layer> + '_ {
|
||||
// for small layer maps, we most likely have all layers resident, but for larger maps more
// layers are likely to be evicted, assuming a larger layer count correlates with a longer lifespan.
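// (unlike the synchronous, iterator-based likely_resident_layers(), this variant takes a
// short-lived keep_resident() guard per layer, which is why it is async and returns a Stream.)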
|
||||
|
||||
self.layer_map().iter_historic_layers().filter_map(|desc| {
|
||||
self.layer_fmgr
|
||||
.0
|
||||
.get(&desc.key())
|
||||
.filter(|l| l.is_likely_resident())
|
||||
.cloned()
|
||||
let layers = self
|
||||
.layer_map()
|
||||
.iter_historic_layers()
|
||||
.map(|desc| self.get_from_desc(&desc));
|
||||
|
||||
let layers = futures::stream::iter(layers);
|
||||
|
||||
layers.filter_map(|layer| async move {
|
||||
// TODO(#6028): this query does not really need to see the ResidentLayer
|
||||
match layer.keep_resident().await {
|
||||
Ok(Some(layer)) => Some(layer.drop_eviction_guard()),
|
||||
Ok(None) => None,
|
||||
Err(e) => {
|
||||
// these should not happen, but we cannot make them statically impossible right
|
||||
// now.
|
||||
tracing::warn!(%layer, "failed to keep the layer resident: {e:#}");
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -86,7 +86,6 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
/// Prepares timeline data by loading it from the basebackup archive.
|
||||
pub(crate) async fn import_basebackup_from_tar(
|
||||
self,
|
||||
tenant: Arc<Tenant>,
|
||||
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
|
||||
base_lsn: Lsn,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
@@ -115,7 +114,7 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
|
||||
// All the data has been imported. Insert the Timeline into the tenant's timelines map
|
||||
let tl = self.finish_creation()?;
|
||||
tl.activate(tenant, broker_client, None, ctx);
|
||||
tl.activate(broker_client, None, ctx);
|
||||
Ok(tl)
|
||||
}
|
||||
|
||||
|
||||
@@ -33,9 +33,11 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroU64;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::select;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -89,27 +91,31 @@ impl WalReceiver {
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
debug!("WAL receiver manager started, connecting to broker");
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let mut connection_manager_state = ConnectionManagerState::new(
|
||||
timeline,
|
||||
conf,
|
||||
);
|
||||
while !cancel.is_cancelled() {
|
||||
let loop_step_result = connection_manager_loop_step(
|
||||
&mut broker_client,
|
||||
&mut connection_manager_state,
|
||||
&walreceiver_ctx,
|
||||
&cancel,
|
||||
&loop_status,
|
||||
).await;
|
||||
match loop_step_result {
|
||||
Ok(()) => continue,
|
||||
Err(_cancelled) => {
|
||||
trace!("Connection manager loop ended, shutting down");
|
||||
loop {
|
||||
select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
trace!("WAL receiver shutdown requested, shutting down");
|
||||
break;
|
||||
}
|
||||
},
|
||||
loop_step_result = connection_manager_loop_step(
|
||||
&mut broker_client,
|
||||
&mut connection_manager_state,
|
||||
&walreceiver_ctx,
|
||||
&loop_status,
|
||||
) => match loop_step_result {
|
||||
ControlFlow::Continue(()) => continue,
|
||||
ControlFlow::Break(()) => {
|
||||
trace!("Connection manager loop ended, shutting down");
|
||||
break;
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
connection_manager_state.shutdown().await;
|
||||
*loop_status.write().unwrap() = None;
|
||||
Ok(())
|
||||
@@ -191,9 +197,6 @@ impl<E: Clone> TaskHandle<E> {
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Cancellation-safe.
|
||||
async fn next_task_event(&mut self) -> TaskEvent<E> {
|
||||
match self.events_receiver.changed().await {
|
||||
Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::metrics::{
|
||||
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
|
||||
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::task_mgr::{shutdown_token, TaskKind};
|
||||
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
@@ -27,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use storage_broker::{BrokerClientChannel, Code, Streaming};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tokio::select;
|
||||
use tracing::*;
|
||||
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
@@ -45,33 +45,27 @@ use super::{
|
||||
TaskEvent, TaskHandle,
|
||||
};
|
||||
|
||||
pub(crate) struct Cancelled;
|
||||
|
||||
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
|
||||
/// Based on the updates, decides whether to start, keep or stop a WAL receiver task.
|
||||
/// If storage broker subscription is cancelled, exits.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Not cancellation-safe. Use `cancel` token to request cancellation.
|
||||
pub(super) async fn connection_manager_loop_step(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
connection_manager_state: &mut ConnectionManagerState,
|
||||
ctx: &RequestContext,
|
||||
cancel: &CancellationToken,
|
||||
manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
|
||||
) -> Result<(), Cancelled> {
|
||||
match tokio::select! {
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); },
|
||||
st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st }
|
||||
} {
|
||||
) -> ControlFlow<(), ()> {
|
||||
match connection_manager_state
|
||||
.timeline
|
||||
.wait_to_become_active(ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
Err(new_state) => {
|
||||
debug!(
|
||||
?new_state,
|
||||
"state changed, stopping wal connection manager loop"
|
||||
);
|
||||
return Err(Cancelled);
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,7 +86,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// Subscribe to the broker updates. Stream shares underlying TCP connection
|
||||
// with other streams on this client (other connection managers). When
|
||||
// object goes out of scope, stream finishes in drop() automatically.
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
|
||||
debug!("Subscribed for broker timeline updates");
|
||||
|
||||
loop {
|
||||
@@ -100,7 +94,6 @@ pub(super) async fn connection_manager_loop_step(
|
||||
|
||||
// These things are happening concurrently:
|
||||
//
|
||||
// - cancellation request
|
||||
// - keep receiving WAL on the current connection
|
||||
// - if the shared state says we need to change connection, disconnect and return
|
||||
// - this runs in a separate task and we receive updates via a watch channel
|
||||
@@ -108,11 +101,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// - receive updates from broker
|
||||
// - this might change the current desired connection
|
||||
// - timeline state changes to something that does not allow walreceiver to run concurrently
|
||||
|
||||
// NB: make sure each of the select expressions are cancellation-safe
|
||||
// (no need for arms to be cancellation-safe).
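// (if a select expression is not cancellation-safe, losing the race to another branch could drop
// a partially received message or leave shared state half-updated; the winning arm itself runs to
// completion once selected, hence the weaker requirement on arms.)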
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); }
|
||||
select! {
|
||||
Some(wal_connection_update) = async {
|
||||
match connection_manager_state.wal_connection.as_mut() {
|
||||
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
|
||||
@@ -144,7 +133,7 @@ pub(super) async fn connection_manager_loop_step(
            },

            // Got a new update from the broker
            broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
            broker_update = broker_subscription.message() => {
                match broker_update {
                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
                    Err(status) => {
@@ -158,17 +147,16 @@ pub(super) async fn connection_manager_loop_step(
                                warn!("broker subscription failed: {status}");
                            }
                        }
                        return Ok(());
                        return ControlFlow::Continue(());
                    }
                    Ok(None) => {
                        error!("broker subscription stream ended"); // can't happen
                        return Ok(());
                        return ControlFlow::Continue(());
                    }
                }
            },

            new_event = async {
                // Reminder: this match arm needs to be cancellation-safe.
                loop {
                    if connection_manager_state.timeline.current_state() == TimelineState::Loading {
                        warn!("wal connection manager should only be launched after timeline has become active");
@@ -194,11 +182,11 @@ pub(super) async fn connection_manager_loop_step(
                }
            } => match new_event {
                ControlFlow::Continue(()) => {
                    return Ok(());
                    return ControlFlow::Continue(());
                }
                ControlFlow::Break(()) => {
                    debug!("Timeline is no longer active, stopping wal connection manager loop");
                    return Err(Cancelled);
                    return ControlFlow::Break(());
                }
            },

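The hunks above are all about how connection_manager_loop_step reacts to shutdown: one side threads a `CancellationToken` through and returns a dedicated `Cancelled` value, the other drives the decision through `ControlFlow`. A minimal, self-contained sketch of the token-driven select loop, using hypothetical event sources rather than the pageserver's own types:

use tokio_util::sync::CancellationToken;

struct Cancelled;

// Hypothetical stand-in for the broker subscription / task-event sources.
async fn next_event(rx: &mut tokio::sync::mpsc::Receiver<u64>) -> Option<u64> {
    rx.recv().await
}

async fn manager_loop_step(
    rx: &mut tokio::sync::mpsc::Receiver<u64>,
    cancel: &CancellationToken,
) -> Result<(), Cancelled> {
    loop {
        tokio::select! {
            // `cancelled()` is cancellation-safe, so it can be raced on every iteration.
            _ = cancel.cancelled() => return Err(Cancelled),
            // `recv()` is cancellation-safe too: no message is lost if the other branch wins.
            maybe_event = next_event(rx) => match maybe_event {
                Some(event) => tracing::debug!("got event: {event}"),
                None => return Ok(()), // source closed; let the caller restart the step
            },
        }
    }
}

The property the NB comment in the diff insists on is that every future polled inside the select is cancellation-safe; the arm bodies themselves run to completion once selected.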
@@ -230,15 +218,16 @@ pub(super) async fn connection_manager_loop_step(
async fn subscribe_for_timeline_updates(
    broker_client: &mut BrokerClientChannel,
    id: TenantTimelineId,
    cancel: &CancellationToken,
) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
) -> Streaming<SafekeeperTimelineInfo> {
    let mut attempt = 0;
    let cancel = shutdown_token();

    loop {
        exponential_backoff(
            attempt,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
            cancel,
            &cancel,
        )
        .await;
        attempt += 1;
@@ -252,14 +241,9 @@ async fn subscribe_for_timeline_updates(
            subscription_key: Some(key),
        };

        match {
            tokio::select! {
                r = broker_client.subscribe_safekeeper_info(request) => { r }
                _ = cancel.cancelled() => { return Err(Cancelled); }
            }
        } {
        match broker_client.subscribe_safekeeper_info(request).await {
            Ok(resp) => {
                return Ok(resp.into_inner());
                return resp.into_inner();
            }
            Err(e) => {
                // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
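Both versions of subscribe_for_timeline_updates retry forever with capped exponential backoff; they differ in where the cancellation token comes from and whether the gRPC call itself is raced against it. A rough sketch of that retry shape, with placeholder constants and a dummy try_subscribe standing in for the real broker client call:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

struct Cancelled;

// Placeholder for broker_client.subscribe_safekeeper_info(); fails a few times, then succeeds.
async fn try_subscribe(attempt: u32) -> Result<&'static str, &'static str> {
    if attempt < 3 { Err("broker unavailable") } else { Ok("subscription stream") }
}

async fn subscribe_with_backoff(cancel: &CancellationToken) -> Result<&'static str, Cancelled> {
    const BASE_SECS: f64 = 0.1; // placeholder for DEFAULT_BASE_BACKOFF_SECONDS
    const MAX_SECS: f64 = 3.0; // placeholder for DEFAULT_MAX_BACKOFF_SECONDS
    let mut attempt: u32 = 0;
    loop {
        // First attempt sleeps 0s; later ones back off exponentially up to the cap.
        let delay = if attempt == 0 {
            0.0
        } else {
            (BASE_SECS * 2f64.powi(attempt as i32 - 1)).min(MAX_SECS)
        };
        tokio::select! {
            _ = cancel.cancelled() => return Err(Cancelled),
            _ = tokio::time::sleep(Duration::from_secs_f64(delay)) => {}
        }
        match try_subscribe(attempt).await {
            Ok(stream) => return Ok(stream),
            Err(e) => tracing::warn!("subscription attempt {attempt} failed: {e}"),
        }
        attempt += 1;
    }
}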
@@ -502,10 +486,6 @@ impl ConnectionManagerState {

    /// Drops the current connection (if any) and updates retry timeout for the next
    /// connection attempt to the same safekeeper.
    ///
    /// # Cancel-Safety
    ///
    /// Not cancellation-safe.
    async fn drop_old_connection(&mut self, needs_shutdown: bool) {
        let wal_connection = match self.wal_connection.take() {
            Some(wal_connection) => wal_connection,
@@ -513,14 +493,7 @@ impl ConnectionManagerState {
        };

        if needs_shutdown {
            wal_connection
                .connection_task
                .shutdown()
                // This here is why this function isn't cancellation-safe.
                // If we got cancelled here, then self.wal_connection is already None and we lose track of the task.
                // Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None
                // and thus be ineffective.
                .await;
            wal_connection.connection_task.shutdown().await;
        }

        let retry = self
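The comment that one side of this hunk removes is the whole argument for why drop_old_connection is not cancellation-safe: the handle is take()n out of self before the .await, so if the caller's future is dropped at that await point, the only reference to the running task is gone. A toy illustration of the hazard, with a hypothetical TaskHandle rather than the pageserver's:

// Hypothetical handle to a background task; shutdown() asks it to stop and awaits it.
struct TaskHandle;

impl TaskHandle {
    async fn shutdown(self) {}
}

struct ManagerState {
    wal_connection: Option<TaskHandle>,
}

impl ManagerState {
    /// NOT cancellation-safe: after `take()`, `self.wal_connection` is `None`.
    /// If this future is dropped at the `.await`, nobody owns the handle any more,
    /// and a later shutdown of `ManagerState` finds nothing to stop.
    async fn drop_old_connection(&mut self) {
        if let Some(handle) = self.wal_connection.take() {
            handle.shutdown().await; // cancellation here orphans the task
        }
    }
}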
@@ -865,9 +838,6 @@ impl ConnectionManagerState {
        }
    }

    /// # Cancel-Safety
    ///
    /// Not cancellation-safe.
    pub(super) async fn shutdown(mut self) {
        if let Some(wal_connection) = self.wal_connection.take() {
            wal_connection.connection_task.shutdown().await;

@@ -389,6 +389,17 @@ pub(super) async fn handle_walreceiver_connection(
            }
        }

        {
            // This is a hack. It piggybacks on the keepalive messages sent by the
            // safekeeper in order to enforce `checkpoint_timeout` on the currently
            // open layer. This hack doesn't provide a bound on the total size of
            // in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916.
            let mut writer = timeline.writer().await;
            if let Err(err) = writer.tick().await {
                warn!("Timeline writer tick failed: {err}");
            }
        }

        if let Some(last_lsn) = status_update {
            let timeline_remote_consistent_lsn = timeline
                .get_remote_consistent_lsn_visible()

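The added block above runs on every message from the safekeeper, keepalives included, and pokes the timeline writer so that time-based rollover of the open in-memory layer happens even when no new WAL arrives. A loose sketch of that shape, assuming a hypothetical Writer::tick that checks how long the current layer has been open and a channel carrying the replication messages:

use tokio::sync::mpsc;

// Hypothetical writer whose tick() freezes/flushes the open layer if it is too old.
struct Writer;

impl Writer {
    async fn tick(&mut self) -> Result<(), String> {
        Ok(())
    }
}

enum Message {
    Wal(Vec<u8>),
    KeepAlive,
}

async fn receive_loop(mut rx: mpsc::Receiver<Message>, mut writer: Writer) {
    while let Some(msg) = rx.recv().await {
        if let Message::Wal(bytes) = &msg {
            // ingest WAL here ...
            let _ = bytes;
        }
        // Piggyback on every message, keepalives included, to enforce time-based
        // rollover; note this bounds layer age, not total in-memory layer size.
        if let Err(err) = writer.tick().await {
            tracing::warn!("writer tick failed: {err}");
        }
    }
}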
@@ -121,16 +121,11 @@ pub(super) enum SetDeletedFlagProgress {
    Successful(NaiveDateTime),
}

pub(super) struct UploadQueueStoppedDeletable {
pub(super) struct UploadQueueStopped {
    pub(super) upload_queue_for_deletion: UploadQueueInitialized,
    pub(super) deleted_at: SetDeletedFlagProgress,
}

pub(super) enum UploadQueueStopped {
    Deletable(UploadQueueStoppedDeletable),
    Uninitialized,
}

#[derive(thiserror::Error, Debug)]
pub(crate) enum NotInitialized {
    #[error("queue is in state Uninitialized")]
@@ -254,15 +249,12 @@ impl UploadQueue {
        }
    }

    pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> {
    pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
        match self {
            UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
                anyhow::bail!("queue is in state {}", self.as_str())
            }
            UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => {
                anyhow::bail!("queue is in state Stopped(Uninitialized)")
            }
            UploadQueue::Stopped(UploadQueueStopped::Deletable(deletable)) => Ok(deletable),
            UploadQueue::Stopped(stopped) => Ok(stopped),
        }
    }
}

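Taken together, the two upload-queue hunks show the difference between the branches: one models the stopped queue as a plain struct, the other splits it into Deletable and Uninitialized variants so that stopped_mut can only hand out a queue that deletion may actually use. A condensed sketch of the refined shape, with the surrounding pageserver types stubbed out:

// Stubs standing in for the real pageserver types.
struct UploadQueueInitialized;

enum SetDeletedFlagProgress {
    NotRunning,
}

struct UploadQueueStoppedDeletable {
    upload_queue_for_deletion: UploadQueueInitialized,
    deleted_at: SetDeletedFlagProgress,
}

enum UploadQueueStopped {
    Deletable(UploadQueueStoppedDeletable),
    Uninitialized,
}

enum UploadQueue {
    Uninitialized,
    Initialized(UploadQueueInitialized),
    Stopped(UploadQueueStopped),
}

impl UploadQueue {
    /// Only a stopped-and-deletable queue is handed out; every other state is an
    /// error, so callers cannot accidentally treat an uninitialized queue as live.
    fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> {
        match self {
            UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
                anyhow::bail!("queue is not stopped")
            }
            UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => {
                anyhow::bail!("queue is in state Stopped(Uninitialized)")
            }
            UploadQueue::Stopped(UploadQueueStopped::Deletable(d)) => Ok(d),
        }
    }
}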
@@ -61,7 +61,7 @@ pub struct VectoredRead {
}

impl VectoredRead {
    pub fn size(&self) -> usize {
    fn size(&self) -> usize {
        (self.end - self.start) as usize
    }
}

@@ -782,7 +782,7 @@ where
            }
        }
        // NB: don't use `buf.is_empty()` here; it is from the
        // `impl Deref for Slice { Target = [u8] }`; the &[u8]
        // `impl Deref for Slice { Target = [u8] }`; the the &[u8]
        // returned by it only covers the initialized portion of `buf`.
        // Whereas we're interested in ensuring that we filled the entire
        // buffer that the user passed in.

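The NB comment above is about owned-buffer IO: dereferencing the Slice yields only the initialized prefix of the buffer, so is_empty()/len() on that &[u8] says nothing about whether the caller's full request has been satisfied. A generic Vec-based illustration of the distinction (not the actual Slice type):

fn main() {
    // The caller hands in capacity for 8 KiB, but nothing is initialized yet.
    let buf: Vec<u8> = Vec::with_capacity(8192);

    // A deref-style view exposes only the initialized portion.
    let initialized: &[u8] = &buf;
    assert!(initialized.is_empty()); // true, even though 8192 bytes were requested

    // A read loop must compare against the requested length, not the view.
    let requested = buf.capacity();
    let filled = initialized.len();
    assert!(filled < requested); // keep reading until filled == requested
}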
@@ -111,7 +111,6 @@ static PageServer page_servers[MAX_SHARDS];

static bool pageserver_flush(shardno_t shard_no);
static void pageserver_disconnect(shardno_t shard_no);
static void pageserver_disconnect_shard(shardno_t shard_no);

static bool
PagestoreShmemIsValid(void)
@@ -488,31 +487,9 @@ retry:
	return ret;
}

/*
 * Reset prefetch and drop connection to the shard.
 * It also drops connection to all other shards involved in prefetch.
 */

static void
pageserver_disconnect(shardno_t shard_no)
{
	if (page_servers[shard_no].conn)
	{
		/*
		 * If the connection to any pageserver is lost, we throw away the
		 * whole prefetch queue, even for other pageservers. It should not
		 * cause big problems, because connection loss is supposed to be a
		 * rare event.
		 */
		prefetch_on_ps_disconnect();
	}
	pageserver_disconnect_shard(shard_no);
}

/*
 * Disconnect from specified shard
 */
static void
pageserver_disconnect_shard(shardno_t shard_no)
{
	/*
	 * If anything goes wrong while we were sending a request, it's not clear
@@ -526,6 +503,14 @@ pageserver_disconnect_shard(shardno_t shard_no)
		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
		PQfinish(page_servers[shard_no].conn);
		page_servers[shard_no].conn = NULL;

		/*
		 * If the connection to any pageserver is lost, we throw away the
		 * whole prefetch queue, even for other pageservers. It should not
		 * cause big problems, because connection loss is supposed to be a
		 * rare event.
		 */
		prefetch_on_ps_disconnect();
	}
	if (page_servers[shard_no].wes != NULL)
	{
@@ -691,8 +676,7 @@ page_server_api api =
{
	.send = pageserver_send,
	.flush = pageserver_flush,
	.receive = pageserver_receive,
	.disconnect = pageserver_disconnect_shard
	.receive = pageserver_receive
};

static bool

@@ -312,7 +312,7 @@ pg_cluster_size(PG_FUNCTION_ARGS)
{
	int64 size;

	size = GetNeonCurrentClusterSize();
	size = GetZenithCurrentClusterSize();

	if (size == 0)
		PG_RETURN_NULL();

@@ -26,8 +26,6 @@ extern void pg_init_libpagestore(void);
extern void pg_init_walproposer(void);

extern uint64 BackpressureThrottlingTime(void);
extern void SetNeonCurrentClusterSize(uint64 size);
extern uint64 GetNeonCurrentClusterSize(void);
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);

Some files were not shown because too many files have changed in this diff.