mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 03:30:36 +00:00
Compare commits
1 Commits
release-52
...
skyzh/test
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9fb22c1060 |
21
.github/workflows/build_and_test.yml
vendored
21
.github/workflows/build_and_test.yml
vendored
@@ -1121,16 +1121,10 @@ jobs:
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
@@ -1139,15 +1133,6 @@ jobs:
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f deployStorage=false \
|
||||
-f deployStorageBroker=false \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
|
||||
90
.github/workflows/trigger-e2e-tests.yml
vendored
90
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -62,14 +62,14 @@ jobs:
|
||||
|
||||
trigger-e2e-tests:
|
||||
needs: [ tag ]
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
env:
|
||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
steps:
|
||||
- name: check if ecr image are present
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
run: |
|
||||
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
|
||||
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
|
||||
@@ -79,55 +79,41 @@ jobs:
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Set e2e-platforms
|
||||
id: e2e-platforms
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
# Default set of platforms to run e2e tests on
|
||||
platforms='["docker", "k8s"]'
|
||||
|
||||
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
|
||||
# If the workflow run is not a pull request, add k8s-neonvm to the list.
|
||||
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
|
||||
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
|
||||
case "$f" in
|
||||
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
|
||||
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
||||
;;
|
||||
*)
|
||||
# no-op
|
||||
;;
|
||||
esac
|
||||
done
|
||||
else
|
||||
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
||||
fi
|
||||
|
||||
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
- name: Set PR's status to pending and request a remote CI test
|
||||
env:
|
||||
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
|
||||
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
|
||||
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
|
||||
# to place a job run status update later.
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
|
||||
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||
|
||||
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
|
||||
--method POST \
|
||||
--raw-field "state=pending" \
|
||||
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
|
||||
--raw-field "context=neon-cloud-e2e"
|
||||
REMOTE_REPO="${{ github.repository_owner }}/cloud"
|
||||
|
||||
gh workflow --repo ${REMOTE_REPO} \
|
||||
run testing.yml \
|
||||
--ref "main" \
|
||||
--raw-field "ci_job_name=neon-cloud-e2e" \
|
||||
--raw-field "commit_hash=$COMMIT_SHA" \
|
||||
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
|
||||
--raw-field "storage_image_tag=${TAG}" \
|
||||
--raw-field "compute_image_tag=${TAG}" \
|
||||
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
|
||||
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"pending\",
|
||||
\"context\": \"neon-cloud-e2e\",
|
||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||
}"
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"ref\": \"main\",
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||
\"commit_hash\": \"$COMMIT_SHA\",
|
||||
\"remote_repo\": \"${{ github.repository }}\",
|
||||
\"storage_image_tag\": \"${TAG}\",
|
||||
\"compute_image_tag\": \"${TAG}\",
|
||||
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
143
Cargo.lock
generated
143
Cargo.lock
generated
@@ -276,6 +276,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"aws-config",
|
||||
"aws-sdk-secretsmanager",
|
||||
"bytes",
|
||||
"camino",
|
||||
"clap",
|
||||
@@ -346,9 +347,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-credential-types"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
|
||||
checksum = "33cc49dcdd31c8b6e79850a179af4c367669150c7ac0135f176c61bec81a70f7"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -358,9 +359,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-runtime"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
|
||||
checksum = "eb031bff99877c26c28895766f7bb8484a05e24547e370768d6cc9db514662aa"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-sigv4",
|
||||
@@ -380,29 +381,6 @@ dependencies = [
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-iam"
|
||||
version = "1.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-query",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"aws-smithy-xml",
|
||||
"aws-types",
|
||||
"http 0.2.9",
|
||||
"once_cell",
|
||||
"regex-lite",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-s3"
|
||||
version = "1.14.0"
|
||||
@@ -432,6 +410,29 @@ dependencies = [
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-secretsmanager"
|
||||
version = "1.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-runtime",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-json",
|
||||
"aws-smithy-runtime",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"aws-types",
|
||||
"bytes",
|
||||
"fastrand 2.0.0",
|
||||
"http 0.2.9",
|
||||
"once_cell",
|
||||
"regex-lite",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-sdk-sso"
|
||||
version = "1.12.0"
|
||||
@@ -501,9 +502,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-sigv4"
|
||||
version = "1.2.0"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
|
||||
checksum = "c371c6b0ac54d4605eb6f016624fb5c7c2925d315fdf600ac1bf21b19d5f1742"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-smithy-eventstream",
|
||||
@@ -516,7 +517,7 @@ dependencies = [
|
||||
"hex",
|
||||
"hmac",
|
||||
"http 0.2.9",
|
||||
"http 1.1.0",
|
||||
"http 1.0.0",
|
||||
"once_cell",
|
||||
"p256",
|
||||
"percent-encoding",
|
||||
@@ -530,9 +531,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-async"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
|
||||
checksum = "72ee2d09cce0ef3ae526679b522835d63e75fb427aca5413cd371e490d52dcc6"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"pin-project-lite",
|
||||
@@ -573,9 +574,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-http"
|
||||
version = "0.60.7"
|
||||
version = "0.60.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
|
||||
checksum = "dab56aea3cd9e1101a0a999447fb346afb680ab1406cebc44b32346e25b4117d"
|
||||
dependencies = [
|
||||
"aws-smithy-eventstream",
|
||||
"aws-smithy-runtime-api",
|
||||
@@ -594,18 +595,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-json"
|
||||
version = "0.60.7"
|
||||
version = "0.60.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
|
||||
checksum = "fd3898ca6518f9215f62678870064398f00031912390efd03f1f6ef56d83aa8e"
|
||||
dependencies = [
|
||||
"aws-smithy-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-query"
|
||||
version = "0.60.7"
|
||||
version = "0.60.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb"
|
||||
checksum = "bda4b1dfc9810e35fba8a620e900522cd1bd4f9578c446e82f49d1ce41d2e9f9"
|
||||
dependencies = [
|
||||
"aws-smithy-types",
|
||||
"urlencoding",
|
||||
@@ -613,9 +614,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-runtime"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
|
||||
checksum = "fafdab38f40ad7816e7da5dec279400dd505160780083759f01441af1bbb10ea"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
@@ -638,15 +639,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-runtime-api"
|
||||
version = "1.2.0"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
|
||||
checksum = "c18276dd28852f34b3bf501f4f3719781f4999a51c7bff1a5c6dc8c4529adc29"
|
||||
dependencies = [
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-types",
|
||||
"bytes",
|
||||
"http 0.2.9",
|
||||
"http 1.1.0",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -655,9 +655,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-types"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
|
||||
checksum = "bb3e134004170d3303718baa2a4eb4ca64ee0a1c0a7041dca31b38be0fb414f3"
|
||||
dependencies = [
|
||||
"base64-simd",
|
||||
"bytes",
|
||||
@@ -678,18 +678,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "aws-smithy-xml"
|
||||
version = "0.60.7"
|
||||
version = "0.60.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
|
||||
checksum = "8604a11b25e9ecaf32f9aa56b9fe253c5e2f606a3477f0071e96d3155a5ed218"
|
||||
dependencies = [
|
||||
"xmlparser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-types"
|
||||
version = "1.1.8"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
|
||||
checksum = "789bbe008e65636fe1b6dbbb374c40c8960d1232b96af5ff4aec349f9c4accf4"
|
||||
dependencies = [
|
||||
"aws-credential-types",
|
||||
"aws-smithy-async",
|
||||
@@ -2396,9 +2396,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "1.1.0"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
|
||||
checksum = "b32afd38673a8016f7c9ae69e5af41a58f81b1d31689040f2f1959594ce194ea"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
@@ -2498,7 +2498,7 @@ dependencies = [
|
||||
"hyper",
|
||||
"log",
|
||||
"rustls 0.21.9",
|
||||
"rustls-native-certs 0.6.2",
|
||||
"rustls-native-certs",
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
]
|
||||
@@ -3581,7 +3581,6 @@ dependencies = [
|
||||
"strum_macros",
|
||||
"svg_fmt",
|
||||
"sync_wrapper",
|
||||
"sysinfo",
|
||||
"tenant_size_model",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
@@ -4200,10 +4199,6 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"aws-config",
|
||||
"aws-sdk-iam",
|
||||
"aws-sigv4",
|
||||
"aws-types",
|
||||
"base64 0.13.1",
|
||||
"bstr",
|
||||
"bytes",
|
||||
@@ -4214,7 +4209,6 @@ dependencies = [
|
||||
"consumption_metrics",
|
||||
"dashmap",
|
||||
"env_logger",
|
||||
"fallible-iterator",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.13.2",
|
||||
@@ -4222,7 +4216,6 @@ dependencies = [
|
||||
"hex",
|
||||
"hmac",
|
||||
"hostname",
|
||||
"http 1.1.0",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"hyper-tungstenite",
|
||||
@@ -4438,9 +4431,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "redis"
|
||||
version = "0.25.2"
|
||||
version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
|
||||
checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -4449,15 +4442,15 @@ dependencies = [
|
||||
"itoa",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.22.2",
|
||||
"rustls-native-certs 0.7.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls-pki-types",
|
||||
"rustls 0.21.9",
|
||||
"rustls-native-certs",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"rustls-webpki 0.101.7",
|
||||
"ryu",
|
||||
"sha1_smol",
|
||||
"socket2 0.5.5",
|
||||
"socket2 0.4.9",
|
||||
"tokio",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"url",
|
||||
]
|
||||
@@ -4886,19 +4879,6 @@ dependencies = [
|
||||
"security-framework",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-native-certs"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792"
|
||||
dependencies = [
|
||||
"openssl-probe",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls-pki-types",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-pemfile"
|
||||
version = "1.0.2"
|
||||
@@ -6166,7 +6146,7 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost",
|
||||
"rustls-native-certs 0.6.2",
|
||||
"rustls-native-certs",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
@@ -7051,6 +7031,7 @@ dependencies = [
|
||||
"aws-sigv4",
|
||||
"aws-smithy-async",
|
||||
"aws-smithy-http",
|
||||
"aws-smithy-runtime-api",
|
||||
"aws-smithy-types",
|
||||
"axum",
|
||||
"base64 0.21.1",
|
||||
|
||||
@@ -52,12 +52,10 @@ async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "1.14"
|
||||
aws-sdk-iam = "1.15.0"
|
||||
aws-sdk-secretsmanager = { version = "1.14.0" }
|
||||
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
|
||||
aws-smithy-types = "1.1.4"
|
||||
aws-credential-types = "1.1.4"
|
||||
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
|
||||
aws-types = "1.1.7"
|
||||
axum = { version = "0.6.20", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
@@ -78,7 +76,6 @@ either = "1.8"
|
||||
enum-map = "2.4.2"
|
||||
enumset = "1.0.12"
|
||||
fail = "0.5.0"
|
||||
fallible-iterator = "0.2"
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
@@ -91,7 +88,6 @@ hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
hmac = "0.12.1"
|
||||
hostname = "0.3.1"
|
||||
http = {version = "1.1.0", features = ["std"]}
|
||||
http-types = { version = "2", default-features = false }
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
@@ -125,7 +121,7 @@ procfs = "0.14"
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.11"
|
||||
rand = "0.8"
|
||||
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
regex = "1.10.2"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
|
||||
|
||||
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.77.0
|
||||
ENV RUSTC_VERSION=1.76.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||
@@ -149,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
cargo install --git https://github.com/paritytech/cachepot && \
|
||||
cargo install rustfilt && \
|
||||
cargo install cargo-hakari && \
|
||||
cargo install cargo-deny --locked && \
|
||||
cargo install cargo-deny && \
|
||||
cargo install cargo-hack && \
|
||||
cargo install cargo-nextest && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
|
||||
@@ -17,7 +17,6 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
||||
.write(true)
|
||||
.create(true)
|
||||
.append(false)
|
||||
.truncate(false)
|
||||
.open(path)?;
|
||||
let buf = io::BufReader::new(&file);
|
||||
let mut count: usize = 0;
|
||||
|
||||
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
RoleAction::Create => {
|
||||
// This branch only runs when roles are created through the console, so it is
|
||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||
// from neon_superuser.
|
||||
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("running role create query: '{}'", &query);
|
||||
@@ -806,8 +806,19 @@ $$;"#,
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name TEXT;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
||||
END LOOP;
|
||||
END
|
||||
$$;"#,
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
|
||||
@@ -16,6 +16,7 @@ testing = []
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-secretsmanager.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
|
||||
@@ -139,7 +139,7 @@ impl HeartbeaterTask {
|
||||
.with_client_retries(
|
||||
|client| async move { client.get_utilization().await },
|
||||
&jwt_token,
|
||||
3,
|
||||
2,
|
||||
3,
|
||||
Duration::from_secs(1),
|
||||
&cancel,
|
||||
|
||||
@@ -3,6 +3,7 @@ use attachment_service::http::make_router;
|
||||
use attachment_service::metrics::preinitialize_metrics;
|
||||
use attachment_service::persistence::Persistence;
|
||||
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
|
||||
use aws_config::{BehaviorVersion, Region};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use diesel::Connection;
|
||||
@@ -54,31 +55,11 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
database_url: Option<String>,
|
||||
|
||||
/// Flag to enable dev mode, which permits running without auth
|
||||
#[arg(long, default_value = "false")]
|
||||
dev: bool,
|
||||
|
||||
/// Grace period before marking unresponsive pageserver offline
|
||||
#[arg(long)]
|
||||
max_unavailable_interval: Option<humantime::Duration>,
|
||||
}
|
||||
|
||||
enum StrictMode {
|
||||
/// In strict mode, we will require that all secrets are loaded, i.e. security features
|
||||
/// may not be implicitly turned off by omitting secrets in the environment.
|
||||
Strict,
|
||||
/// In dev mode, secrets are optional, and omitting a particular secret will implicitly
|
||||
/// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated
|
||||
/// requests, no public key -> don't authenticate incoming requests).
|
||||
Dev,
|
||||
}
|
||||
|
||||
impl Default for StrictMode {
|
||||
fn default() -> Self {
|
||||
Self::Strict
|
||||
}
|
||||
}
|
||||
|
||||
/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
|
||||
/// type encapsulates the logic to decide which and do the loading.
|
||||
struct Secrets {
|
||||
@@ -89,6 +70,13 @@ struct Secrets {
|
||||
}
|
||||
|
||||
impl Secrets {
|
||||
const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
|
||||
const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
|
||||
"neon-storage-controller-pageserver-jwt-token";
|
||||
const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
|
||||
"neon-storage-controller-control-plane-jwt-token";
|
||||
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
|
||||
|
||||
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
|
||||
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
|
||||
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
|
||||
@@ -99,41 +87,111 @@ impl Secrets {
|
||||
/// - Environment variables if DATABASE_URL is set.
|
||||
/// - AWS Secrets Manager secrets
|
||||
async fn load(args: &Cli) -> anyhow::Result<Self> {
|
||||
let Some(database_url) =
|
||||
Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
|
||||
else {
|
||||
anyhow::bail!(
|
||||
"Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
|
||||
)
|
||||
};
|
||||
|
||||
let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
|
||||
Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let this = Self {
|
||||
database_url,
|
||||
public_key,
|
||||
jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
|
||||
control_plane_jwt_token: Self::load_secret(
|
||||
&args.control_plane_jwt_token,
|
||||
Self::CONTROL_PLANE_JWT_TOKEN_ENV,
|
||||
)
|
||||
.await,
|
||||
};
|
||||
|
||||
Ok(this)
|
||||
match &args.database_url {
|
||||
Some(url) => Self::load_cli(url, args),
|
||||
None => match std::env::var(Self::DATABASE_URL_ENV) {
|
||||
Ok(database_url) => Self::load_env(database_url),
|
||||
Err(_) => Self::load_aws_sm().await,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
|
||||
if let Some(v) = cli {
|
||||
Some(v.clone())
|
||||
} else if let Ok(v) = std::env::var(env_name) {
|
||||
Some(v)
|
||||
} else {
|
||||
None
|
||||
fn load_env(database_url: String) -> anyhow::Result<Self> {
|
||||
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
|
||||
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
|
||||
Err(_) => None,
|
||||
};
|
||||
Ok(Self {
|
||||
database_url,
|
||||
public_key,
|
||||
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
|
||||
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn load_aws_sm() -> anyhow::Result<Self> {
|
||||
let Ok(region) = std::env::var("AWS_REGION") else {
|
||||
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
|
||||
};
|
||||
let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
|
||||
.region(Region::new(region.clone()))
|
||||
.load()
|
||||
.await;
|
||||
|
||||
let asm = aws_sdk_secretsmanager::Client::new(&config);
|
||||
|
||||
let Some(database_url) = asm
|
||||
.get_secret_value()
|
||||
.secret_id(Self::DATABASE_URL_SECRET)
|
||||
.send()
|
||||
.await?
|
||||
.secret_string()
|
||||
.map(str::to_string)
|
||||
else {
|
||||
anyhow::bail!(
|
||||
"Database URL secret not found at {region}/{}",
|
||||
Self::DATABASE_URL_SECRET
|
||||
)
|
||||
};
|
||||
|
||||
let jwt_token = asm
|
||||
.get_secret_value()
|
||||
.secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
|
||||
.send()
|
||||
.await?
|
||||
.secret_string()
|
||||
.map(str::to_string);
|
||||
if jwt_token.is_none() {
|
||||
tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
|
||||
}
|
||||
|
||||
let control_plane_jwt_token = asm
|
||||
.get_secret_value()
|
||||
.secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
|
||||
.send()
|
||||
.await?
|
||||
.secret_string()
|
||||
.map(str::to_string);
|
||||
if jwt_token.is_none() {
|
||||
tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver");
|
||||
}
|
||||
|
||||
let public_key = asm
|
||||
.get_secret_value()
|
||||
.secret_id(Self::PUBLIC_KEY_SECRET)
|
||||
.send()
|
||||
.await?
|
||||
.secret_string()
|
||||
.map(str::to_string);
|
||||
let public_key = match public_key {
|
||||
Some(key) => Some(JwtAuth::from_key(key)?),
|
||||
None => {
|
||||
tracing::warn!(
|
||||
"No public key set: inccoming HTTP requests will not be authenticated"
|
||||
);
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
database_url,
|
||||
public_key,
|
||||
jwt_token,
|
||||
control_plane_jwt_token,
|
||||
})
|
||||
}
|
||||
|
||||
fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
|
||||
let public_key = match &args.public_key {
|
||||
None => None,
|
||||
Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
|
||||
};
|
||||
Ok(Self {
|
||||
database_url: database_url.to_owned(),
|
||||
public_key,
|
||||
jwt_token: args.jwt_token.clone(),
|
||||
control_plane_jwt_token: args.control_plane_jwt_token.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -189,42 +247,8 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
args.listen
|
||||
);
|
||||
|
||||
let strict_mode = if args.dev {
|
||||
StrictMode::Dev
|
||||
} else {
|
||||
StrictMode::Strict
|
||||
};
|
||||
|
||||
let secrets = Secrets::load(&args).await?;
|
||||
|
||||
// Validate required secrets and arguments are provided in strict mode
|
||||
match strict_mode {
|
||||
StrictMode::Strict
|
||||
if (secrets.public_key.is_none()
|
||||
|| secrets.jwt_token.is_none()
|
||||
|| secrets.control_plane_jwt_token.is_none()) =>
|
||||
{
|
||||
// Production systems should always have secrets configured: if public_key was not set
|
||||
// then we would implicitly disable auth.
|
||||
anyhow::bail!(
|
||||
"Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode"
|
||||
);
|
||||
}
|
||||
StrictMode::Strict if args.compute_hook_url.is_none() => {
|
||||
// Production systems should always have a compute hook set, to prevent falling
|
||||
// back to trying to use neon_local.
|
||||
anyhow::bail!(
|
||||
"`--compute-hook-url` is not set: this is only permitted in `--dev` mode"
|
||||
);
|
||||
}
|
||||
StrictMode::Strict => {
|
||||
tracing::info!("Starting in strict mode: configuration is OK.")
|
||||
}
|
||||
StrictMode::Dev => {
|
||||
tracing::warn!("Starting in dev mode: this may be an insecure configuration.")
|
||||
}
|
||||
}
|
||||
|
||||
let config = Config {
|
||||
jwt_token: secrets.jwt_token,
|
||||
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
||||
|
||||
@@ -1523,8 +1523,6 @@ impl Service {
|
||||
&self,
|
||||
create_req: TenantCreateRequest,
|
||||
) -> Result<TenantCreateResponse, ApiError> {
|
||||
let tenant_id = create_req.new_tenant_id.tenant_id;
|
||||
|
||||
// Exclude any concurrent attempts to create/access the same tenant ID
|
||||
let _tenant_lock = self
|
||||
.tenant_op_locks
|
||||
@@ -1533,12 +1531,7 @@ impl Service {
|
||||
|
||||
let (response, waiters) = self.do_tenant_create(create_req).await?;
|
||||
|
||||
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
|
||||
// Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to
|
||||
// accept compute notifications while it is in the process of creating. Reconciliation will
|
||||
// be retried in the background.
|
||||
tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})");
|
||||
}
|
||||
self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
@@ -1617,25 +1610,13 @@ impl Service {
|
||||
splitting: SplitState::default(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
match self
|
||||
.persistence
|
||||
self.persistence
|
||||
.insert_tenant_shards(persist_tenant_shards)
|
||||
.await
|
||||
{
|
||||
Ok(_) => {}
|
||||
Err(DatabaseError::Query(diesel::result::Error::DatabaseError(
|
||||
DatabaseErrorKind::UniqueViolation,
|
||||
_,
|
||||
))) => {
|
||||
// Unique key violation: this is probably a retry. Because the shard count is part of the unique key,
|
||||
// if we see a unique key violation it means that the creation request's shard count matches the previous
|
||||
// creation's shard count.
|
||||
tracing::info!("Tenant shards already present in database, proceeding with idempotent creation...");
|
||||
}
|
||||
// Any other database error is unexpected and a bug.
|
||||
Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))),
|
||||
};
|
||||
.map_err(|e| {
|
||||
// TODO: distinguish primary key constraint (idempotent, OK), from other errors
|
||||
ApiError::InternalServerError(anyhow::anyhow!(e))
|
||||
})?;
|
||||
|
||||
let (waiters, response_shards) = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
|
||||
@@ -294,7 +294,7 @@ where
|
||||
// is in state 'taken' but the thread that would unlock it is
|
||||
// not there.
|
||||
// 2. A rust object that represented some external resource in the
|
||||
// parent now got implicitly copied by the fork, even though
|
||||
// parent now got implicitly copied by the the fork, even though
|
||||
// the object's type is not `Copy`. The parent program may use
|
||||
// non-copyability as way to enforce unique ownership of an
|
||||
// external resource in the typesystem. The fork breaks that
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
//!
|
||||
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
|
||||
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
|
||||
//! the basebackup from the pageserver to initialize the data directory, and
|
||||
//! the basebackup from the pageserver to initialize the the data directory, and
|
||||
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
|
||||
//! until it exits.
|
||||
//!
|
||||
|
||||
@@ -279,7 +279,6 @@ impl StorageController {
|
||||
&self.listen,
|
||||
"-p",
|
||||
self.path.as_ref(),
|
||||
"--dev",
|
||||
"--database-url",
|
||||
&database_url,
|
||||
"--max-unavailable-interval",
|
||||
|
||||
@@ -40,7 +40,7 @@ macro_rules! register_hll {
|
||||
}};
|
||||
|
||||
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
|
||||
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
|
||||
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
|
||||
}};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use anyhow::*;
|
||||
use clap::{value_parser, Arg, ArgMatches, Command};
|
||||
use postgres::Client;
|
||||
use std::{path::PathBuf, str::FromStr};
|
||||
use wal_craft::*;
|
||||
|
||||
@@ -9,8 +8,8 @@ fn main() -> Result<()> {
|
||||
.init();
|
||||
let arg_matches = cli().get_matches();
|
||||
|
||||
let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
|
||||
let intermediate_lsns = match arg_matches
|
||||
let wal_craft = |arg_matches: &ArgMatches, client| {
|
||||
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
|
||||
.get_one::<String>("type")
|
||||
.map(|s| s.as_str())
|
||||
.context("'type' is required")?
|
||||
@@ -26,7 +25,6 @@ fn main() -> Result<()> {
|
||||
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
|
||||
a => panic!("Unknown --type argument: {a}"),
|
||||
};
|
||||
let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
for lsn in intermediate_lsns {
|
||||
println!("intermediate_lsn = {lsn}");
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ use postgres::types::PgLsn;
|
||||
use postgres::Client;
|
||||
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
||||
use std::cmp::Ordering;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -231,52 +232,59 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow
|
||||
pub trait Crafter {
|
||||
const NAME: &'static str;
|
||||
|
||||
/// Generates WAL using the client `client`. Returns a vector of some valid
|
||||
/// "interesting" intermediate LSNs which one may start reading from.
|
||||
/// test_end_of_wal uses this to check various starting points.
|
||||
///
|
||||
/// Note that postgres is generally keen about writing some WAL. While we
|
||||
/// try to disable it (autovacuum, big wal_writer_delay, etc) it is always
|
||||
/// possible, e.g. xl_running_xacts are dumped each 15s. So checks about
|
||||
/// stable WAL end would be flaky unless postgres is shut down. For this
|
||||
/// reason returning potential end of WAL here is pointless. Most of the
|
||||
/// time this doesn't happen though, so it is reasonable to create needed
|
||||
/// WAL structure and immediately kill postgres like test_end_of_wal does.
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>>;
|
||||
/// Generates WAL using the client `client`. Returns a pair of:
|
||||
/// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
|
||||
/// May include or exclude Lsn(0) and the end-of-wal.
|
||||
/// * The expected end-of-wal LSN.
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
|
||||
}
|
||||
|
||||
/// Wraps some WAL craft function, providing current LSN to it before the
|
||||
/// insertion and flushing WAL afterwards. Also pushes initial LSN to the
|
||||
/// result.
|
||||
fn craft_internal<C: postgres::GenericClient>(
|
||||
client: &mut C,
|
||||
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<Vec<PgLsn>>,
|
||||
) -> anyhow::Result<Vec<PgLsn>> {
|
||||
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
|
||||
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
ensure_server_config(client)?;
|
||||
|
||||
let initial_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
info!("LSN initial = {}", initial_lsn);
|
||||
|
||||
let mut intermediate_lsns = f(client, initial_lsn)?;
|
||||
let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
|
||||
let last_lsn = match last_lsn {
|
||||
None => client.pg_current_wal_insert_lsn()?,
|
||||
Some(last_lsn) => {
|
||||
let insert_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
match last_lsn.cmp(&insert_lsn) {
|
||||
Ordering::Less => bail!(
|
||||
"Some records were inserted after the crafted WAL: {} vs {}",
|
||||
last_lsn,
|
||||
insert_lsn
|
||||
),
|
||||
Ordering::Equal => last_lsn,
|
||||
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
|
||||
}
|
||||
}
|
||||
};
|
||||
if !intermediate_lsns.starts_with(&[initial_lsn]) {
|
||||
intermediate_lsns.insert(0, initial_lsn);
|
||||
}
|
||||
|
||||
// Some records may be not flushed, e.g. non-transactional logical messages.
|
||||
//
|
||||
// Note: this is broken if pg_current_wal_insert_lsn is at page boundary
|
||||
// because pg_current_wal_insert_lsn skips page headers.
|
||||
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
||||
Ok(intermediate_lsns)
|
||||
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
|
||||
Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
|
||||
Ordering::Equal => {}
|
||||
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
|
||||
}
|
||||
Ok((intermediate_lsns, last_lsn))
|
||||
}
|
||||
|
||||
pub struct Simple;
|
||||
impl Crafter for Simple {
|
||||
const NAME: &'static str = "simple";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
craft_internal(client, |client, _| {
|
||||
client.execute("CREATE table t(x int)", &[])?;
|
||||
Ok(Vec::new())
|
||||
Ok((Vec::new(), None))
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -284,36 +292,29 @@ impl Crafter for Simple {
|
||||
pub struct LastWalRecordXlogSwitch;
|
||||
impl Crafter for LastWalRecordXlogSwitch {
|
||||
const NAME: &'static str = "last_wal_record_xlog_switch";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||
// Do not use craft_internal because here we end up with flush_lsn exactly on
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
||||
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
||||
ensure_server_config(client)?;
|
||||
|
||||
client.execute("CREATE table t(x int)", &[])?;
|
||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||
// pg_switch_wal returns end of last record of the switched segment,
|
||||
// i.e. end of SWITCH itself.
|
||||
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let before_xlog_switch_u64 = u64::from(before_xlog_switch);
|
||||
let next_segment = PgLsn::from(
|
||||
before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64)
|
||||
+ WAL_SEGMENT_SIZE as u64,
|
||||
);
|
||||
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let next_segment = PgLsn::from(0x0200_0000);
|
||||
ensure!(
|
||||
xlog_switch_record_end <= next_segment,
|
||||
"XLOG_SWITCH record ended after the expected segment boundary: {} > {}",
|
||||
xlog_switch_record_end,
|
||||
after_xlog_switch <= next_segment,
|
||||
"XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
|
||||
after_xlog_switch,
|
||||
next_segment
|
||||
);
|
||||
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
||||
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
|
||||
/// Craft xlog SWITCH record ending at page boundary.
|
||||
impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
||||
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
||||
ensure_server_config(client)?;
|
||||
@@ -360,29 +361,28 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
|
||||
// Emit the XLOG_SWITCH
|
||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let next_segment = PgLsn::from(0x0200_0000);
|
||||
ensure!(
|
||||
xlog_switch_record_end < next_segment,
|
||||
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
|
||||
xlog_switch_record_end,
|
||||
after_xlog_switch < next_segment,
|
||||
"XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
|
||||
after_xlog_switch,
|
||||
next_segment
|
||||
);
|
||||
ensure!(
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
|
||||
xlog_switch_record_end,
|
||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||
after_xlog_switch,
|
||||
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
|
||||
);
|
||||
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
||||
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
|
||||
}
|
||||
}
|
||||
|
||||
/// Write ~16MB logical message; it should cross WAL segment.
|
||||
fn craft_seg_size_logical_message(
|
||||
fn craft_single_logical_message(
|
||||
client: &mut impl postgres::GenericClient,
|
||||
transactional: bool,
|
||||
) -> anyhow::Result<Vec<PgLsn>> {
|
||||
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
craft_internal(client, |client, initial_lsn| {
|
||||
ensure!(
|
||||
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
|
||||
@@ -405,24 +405,34 @@ fn craft_seg_size_logical_message(
|
||||
"Logical message crossed two segments"
|
||||
);
|
||||
|
||||
Ok(vec![message_lsn])
|
||||
if transactional {
|
||||
// Transactional logical messages are part of a transaction, so the one above is
|
||||
// followed by a small COMMIT record.
|
||||
|
||||
let after_message_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
ensure!(
|
||||
message_lsn < after_message_lsn,
|
||||
"No record found after the emitted message"
|
||||
);
|
||||
Ok((vec![message_lsn], Some(after_message_lsn)))
|
||||
} else {
|
||||
Ok((Vec::new(), Some(message_lsn)))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub struct WalRecordCrossingSegmentFollowedBySmallOne;
|
||||
impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
|
||||
const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||
// Transactional message crossing WAL segment will be followed by small
|
||||
// commit record.
|
||||
craft_seg_size_logical_message(client, true)
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
craft_single_logical_message(client, true)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LastWalRecordCrossingSegment;
|
||||
impl Crafter for LastWalRecordCrossingSegment {
|
||||
const NAME: &'static str = "last_wal_record_crossing_segment";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
||||
craft_seg_size_logical_message(client, false)
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||
craft_single_logical_message(client, false)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,15 +11,13 @@ use utils::const_assert;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
fn init_logging() {
|
||||
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!(
|
||||
"crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"
|
||||
)))
|
||||
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
|
||||
format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
|
||||
))
|
||||
.is_test(true)
|
||||
.try_init();
|
||||
}
|
||||
|
||||
/// Test that find_end_of_wal returns the same results as pg_dump on various
|
||||
/// WALs created by Crafter.
|
||||
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
use crate::*;
|
||||
|
||||
@@ -40,13 +38,13 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
}
|
||||
cfg.initdb().unwrap();
|
||||
let srv = cfg.start_server().unwrap();
|
||||
let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
|
||||
let (intermediate_lsns, expected_end_of_wal_partial) =
|
||||
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
|
||||
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
|
||||
.iter()
|
||||
.map(|&lsn| u64::from(lsn).into())
|
||||
.collect();
|
||||
// Kill postgres. Note that it might have inserted to WAL something after
|
||||
// 'craft' did its job.
|
||||
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
|
||||
srv.kill();
|
||||
|
||||
// Check find_end_of_wal on the initial WAL
|
||||
@@ -58,7 +56,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
.filter(|fname| IsXLogFileName(fname))
|
||||
.max()
|
||||
.unwrap();
|
||||
let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment);
|
||||
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
|
||||
for start_lsn in intermediate_lsns
|
||||
.iter()
|
||||
.chain(std::iter::once(&expected_end_of_wal))
|
||||
@@ -93,7 +91,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||
}
|
||||
}
|
||||
|
||||
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
|
||||
fn check_pg_waldump_end_of_wal(
|
||||
cfg: &crate::Conf,
|
||||
last_segment: &str,
|
||||
expected_end_of_wal: Lsn,
|
||||
) {
|
||||
// Get the actual end of WAL by pg_waldump
|
||||
let waldump_output = cfg
|
||||
.pg_waldump("000000010000000000000001", last_segment)
|
||||
@@ -111,8 +113,11 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
|
||||
}
|
||||
};
|
||||
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
||||
info!("waldump erred on {}", waldump_wal_end);
|
||||
waldump_wal_end
|
||||
info!(
|
||||
"waldump erred on {}, expected wal end at {}",
|
||||
waldump_wal_end, expected_end_of_wal
|
||||
);
|
||||
assert_eq!(waldump_wal_end, expected_end_of_wal);
|
||||
}
|
||||
|
||||
fn check_end_of_wal(
|
||||
@@ -205,9 +210,9 @@ pub fn test_update_next_xid() {
|
||||
#[test]
|
||||
pub fn test_encode_logical_message() {
|
||||
let expected = [
|
||||
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
|
||||
105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
|
||||
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
|
||||
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
|
||||
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
|
||||
];
|
||||
let actual = encode_logical_message("prefix", "message");
|
||||
assert_eq!(expected, actual[..]);
|
||||
|
||||
@@ -198,7 +198,6 @@ impl LocalFs {
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(&temp_file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
|
||||
@@ -247,7 +247,7 @@ fn scenario_4() {
|
||||
//
|
||||
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
|
||||
//
|
||||
// (If we used the method from the previous scenario, and
|
||||
// (If we used the the method from the previous scenario, and
|
||||
// kept only snapshot at the branch point, we'd need to keep
|
||||
// all the WAL between 10000-18000 on the main branch, so
|
||||
// the total size would be 5000 + 1000 + 8000 = 14000. The
|
||||
|
||||
@@ -63,7 +63,6 @@ impl UnwrittenLockFile {
|
||||
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
|
||||
let lock_file = fs::OpenOptions::new()
|
||||
.create(true) // O_CREAT
|
||||
.truncate(true)
|
||||
.write(true)
|
||||
.open(lock_file_path)
|
||||
.context("open lock file")?;
|
||||
|
||||
@@ -69,7 +69,7 @@ pub struct Config {
|
||||
/// should be removed once we have a better solution there.
|
||||
sys_buffer_bytes: u64,
|
||||
|
||||
/// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
|
||||
/// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
|
||||
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
|
||||
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
|
||||
/// threshold.
|
||||
|
||||
@@ -59,7 +59,6 @@ signal-hook.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
svg_fmt.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
sysinfo.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
|
||||
@@ -600,37 +600,32 @@ fn start_pageserver(
|
||||
None,
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
{
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
tenant_manager,
|
||||
metric_collection_endpoint,
|
||||
&conf.metric_collection_bucket,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
local_disk_storage,
|
||||
cancel,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
local_disk_storage,
|
||||
cancel,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
@@ -95,8 +95,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -158,8 +156,6 @@ pub mod defaults {
|
||||
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
|
||||
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
|
||||
|
||||
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
|
||||
|
||||
[remote_storage]
|
||||
|
||||
"#
|
||||
@@ -238,7 +234,6 @@ pub struct PageServerConf {
|
||||
// How often to send unchanged cached metrics to the metrics endpoint.
|
||||
pub cached_metric_collection_interval: Duration,
|
||||
pub metric_collection_endpoint: Option<Url>,
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
|
||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||
@@ -283,13 +278,6 @@ pub struct PageServerConf {
|
||||
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||
|
||||
pub validate_vectored_get: bool,
|
||||
|
||||
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
|
||||
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
|
||||
/// of ephemeral data.
|
||||
///
|
||||
/// Setting this to zero disables limits on total ephemeral layer size.
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -385,7 +373,6 @@ struct PageServerConfigBuilder {
|
||||
cached_metric_collection_interval: BuilderValue<Duration>,
|
||||
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
||||
synthetic_size_calculation_interval: BuilderValue<Duration>,
|
||||
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
|
||||
|
||||
disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
|
||||
|
||||
@@ -411,8 +398,6 @@ struct PageServerConfigBuilder {
|
||||
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
|
||||
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -470,8 +455,6 @@ impl PageServerConfigBuilder {
|
||||
.expect("cannot parse default synthetic size calculation interval")),
|
||||
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
||||
|
||||
metric_collection_bucket: Set(None),
|
||||
|
||||
disk_usage_based_eviction: Set(None),
|
||||
|
||||
test_remote_failures: Set(0),
|
||||
@@ -499,7 +482,6 @@ impl PageServerConfigBuilder {
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -604,13 +586,6 @@ impl PageServerConfigBuilder {
|
||||
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
|
||||
}
|
||||
|
||||
pub fn metric_collection_bucket(
|
||||
&mut self,
|
||||
metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
) {
|
||||
self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket)
|
||||
}
|
||||
|
||||
pub fn synthetic_size_calculation_interval(
|
||||
&mut self,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
@@ -679,10 +654,6 @@ impl PageServerConfigBuilder {
|
||||
self.validate_vectored_get = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
|
||||
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -723,7 +694,6 @@ impl PageServerConfigBuilder {
|
||||
metric_collection_interval,
|
||||
cached_metric_collection_interval,
|
||||
metric_collection_endpoint,
|
||||
metric_collection_bucket,
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
@@ -738,7 +708,6 @@ impl PageServerConfigBuilder {
|
||||
get_vectored_impl,
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -973,9 +942,6 @@ impl PageServerConf {
|
||||
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
|
||||
builder.metric_collection_endpoint(Some(endpoint));
|
||||
},
|
||||
"metric_collection_bucket" => {
|
||||
builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
|
||||
}
|
||||
"synthetic_size_calculation_interval" =>
|
||||
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
|
||||
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
|
||||
@@ -1029,9 +995,6 @@ impl PageServerConf {
|
||||
"validate_vectored_get" => {
|
||||
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
|
||||
}
|
||||
"ephemeral_bytes_per_memory_kb" => {
|
||||
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1094,7 +1057,6 @@ impl PageServerConf {
|
||||
metric_collection_interval: Duration::from_secs(60),
|
||||
cached_metric_collection_interval: Duration::from_secs(60 * 60),
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
||||
disk_usage_based_eviction: None,
|
||||
test_remote_failures: 0,
|
||||
@@ -1113,7 +1075,6 @@ impl PageServerConf {
|
||||
.expect("Invalid default constant"),
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1328,7 +1289,6 @@ background_task_maximum_delay = '334 s'
|
||||
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
|
||||
)?,
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
|
||||
)?,
|
||||
@@ -1351,7 +1311,6 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1404,7 +1363,6 @@ background_task_maximum_delay = '334 s'
|
||||
metric_collection_interval: Duration::from_secs(222),
|
||||
cached_metric_collection_interval: Duration::from_secs(22200),
|
||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||
metric_collection_bucket: None,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(333),
|
||||
disk_usage_based_eviction: None,
|
||||
test_remote_failures: 0,
|
||||
@@ -1423,7 +1381,6 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -3,13 +3,10 @@
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::{
|
||||
mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
|
||||
};
|
||||
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
|
||||
use camino::Utf8PathBuf;
|
||||
use consumption_metrics::EventType;
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
||||
use reqwest::Url;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
@@ -43,9 +40,7 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
|
||||
/// Main thread that serves metrics collection
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn collect_metrics(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
metric_collection_endpoint: &Url,
|
||||
metric_collection_bucket: &Option<RemoteStorageConfig>,
|
||||
metric_collection_interval: Duration,
|
||||
_cached_metric_collection_interval: Duration,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
@@ -70,19 +65,15 @@ pub async fn collect_metrics(
|
||||
None,
|
||||
"synthetic size calculation",
|
||||
false,
|
||||
{
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
async move {
|
||||
calculate_synthetic_size_worker(
|
||||
tenant_manager,
|
||||
synthetic_size_calculation_interval,
|
||||
&cancel,
|
||||
&worker_ctx,
|
||||
)
|
||||
.instrument(info_span!("synthetic_size_worker"))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
async move {
|
||||
calculate_synthetic_size_worker(
|
||||
synthetic_size_calculation_interval,
|
||||
&cancel,
|
||||
&worker_ctx,
|
||||
)
|
||||
.instrument(info_span!("synthetic_size_worker"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
@@ -103,27 +94,13 @@ pub async fn collect_metrics(
|
||||
.build()
|
||||
.expect("Failed to create http client with timeout");
|
||||
|
||||
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
|
||||
match GenericRemoteStorage::from_config(bucket_config) {
|
||||
Ok(client) => Some(client),
|
||||
Err(e) => {
|
||||
// Non-fatal error: if we were given an invalid config, we will proceed
|
||||
// with sending metrics over the network, but not to S3.
|
||||
tracing::warn!("Invalid configuration for metric_collection_bucket: {e}");
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let node_id = node_id.to_string();
|
||||
|
||||
loop {
|
||||
let started_at = Instant::now();
|
||||
|
||||
// these are point in time, with variable "now"
|
||||
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
||||
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
|
||||
|
||||
let metrics = Arc::new(metrics);
|
||||
|
||||
@@ -141,18 +118,10 @@ pub async fn collect_metrics(
|
||||
tracing::error!("failed to persist metrics to {path:?}: {e:#}");
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(bucket_client) = &bucket_client {
|
||||
let res =
|
||||
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
|
||||
if let Err(e) = res {
|
||||
tracing::error!("failed to upload to S3: {e:#}");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let upload = async {
|
||||
let res = upload::upload_metrics_http(
|
||||
let res = upload::upload_metrics(
|
||||
&client,
|
||||
metric_collection_endpoint,
|
||||
&cancel,
|
||||
@@ -163,7 +132,7 @@ pub async fn collect_metrics(
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
// serialization error which should never happen
|
||||
tracing::error!("failed to upload via HTTP due to {e:#}");
|
||||
tracing::error!("failed to upload due to {e:#}");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -278,7 +247,6 @@ async fn reschedule(
|
||||
|
||||
/// Caclculate synthetic size for each active tenant
|
||||
async fn calculate_synthetic_size_worker(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
@@ -291,7 +259,7 @@ async fn calculate_synthetic_size_worker(
|
||||
loop {
|
||||
let started_at = Instant::now();
|
||||
|
||||
let tenants = match tenant_manager.list_tenants() {
|
||||
let tenants = match mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
Err(e) => {
|
||||
warn!("cannot get tenant list: {e:#}");
|
||||
@@ -310,14 +278,10 @@ async fn calculate_synthetic_size_worker(
|
||||
continue;
|
||||
}
|
||||
|
||||
let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else {
|
||||
let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
if !tenant.is_active() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// there is never any reason to exit calculate_synthetic_size_worker following any
|
||||
// return value -- we don't need to care about shutdown because no tenant is found when
|
||||
// pageserver is shut down.
|
||||
@@ -355,7 +319,9 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
|
||||
};
|
||||
|
||||
// this error can be returned if timeline is shutting down, but it does not
|
||||
// mean the synthetic size worker should terminate.
|
||||
// mean the synthetic size worker should terminate. we do not need any checks
|
||||
// in this function because `mgr::get_tenant` will error out after shutdown has
|
||||
// progressed to shutting down tenants.
|
||||
let shutting_down = matches!(
|
||||
e.downcast_ref::<PageReconstructError>(),
|
||||
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use crate::tenant::mgr::TenantManager;
|
||||
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::EventType;
|
||||
@@ -182,7 +181,6 @@ impl MetricsKey {
|
||||
}
|
||||
|
||||
pub(super) async fn collect_all_metrics(
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cached_metrics: &Cache,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<RawMetric> {
|
||||
@@ -190,7 +188,7 @@ pub(super) async fn collect_all_metrics(
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let tenants = match tenant_manager.list_tenants() {
|
||||
let tenants = match crate::tenant::mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
Err(err) => {
|
||||
tracing::error!("failed to list tenants: {:?}", err);
|
||||
@@ -202,8 +200,7 @@ pub(super) async fn collect_all_metrics(
|
||||
if state != TenantState::Active || !id.is_zero() {
|
||||
None
|
||||
} else {
|
||||
tenant_manager
|
||||
.get_attached_tenant_shard(id)
|
||||
crate::tenant::mgr::get_tenant(id, true)
|
||||
.ok()
|
||||
.map(|tenant| (id.tenant_id, tenant))
|
||||
}
|
||||
|
||||
@@ -1,9 +1,4 @@
|
||||
use std::time::SystemTime;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
|
||||
@@ -18,9 +13,8 @@ struct Ids {
|
||||
pub(super) timeline_id: Option<TimelineId>,
|
||||
}
|
||||
|
||||
/// Serialize and write metrics to an HTTP endpoint
|
||||
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
|
||||
pub(super) async fn upload_metrics_http(
|
||||
pub(super) async fn upload_metrics(
|
||||
client: &reqwest::Client,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
cancel: &CancellationToken,
|
||||
@@ -80,60 +74,6 @@ pub(super) async fn upload_metrics_http(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Serialize and write metrics to a remote storage object
|
||||
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
|
||||
pub(super) async fn upload_metrics_bucket(
|
||||
client: &GenericRemoteStorage,
|
||||
cancel: &CancellationToken,
|
||||
node_id: &str,
|
||||
metrics: &[RawMetric],
|
||||
) -> anyhow::Result<()> {
|
||||
if metrics.is_empty() {
|
||||
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
|
||||
// of an empty object.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Compose object path
|
||||
let datetime: DateTime<Utc> = SystemTime::now().into();
|
||||
let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ");
|
||||
let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?;
|
||||
|
||||
// Set up a gzip writer into a buffer
|
||||
let mut compressed_bytes: Vec<u8> = Vec::new();
|
||||
let compressed_writer = std::io::Cursor::new(&mut compressed_bytes);
|
||||
let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer);
|
||||
|
||||
// Serialize and write into compressed buffer
|
||||
let started_at = std::time::Instant::now();
|
||||
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
|
||||
let (_chunk, body) = res?;
|
||||
gzip_writer.write_all(&body).await?;
|
||||
}
|
||||
gzip_writer.flush().await?;
|
||||
gzip_writer.shutdown().await?;
|
||||
let compressed_length = compressed_bytes.len();
|
||||
|
||||
// Write to remote storage
|
||||
client
|
||||
.upload_storage_object(
|
||||
futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))),
|
||||
compressed_length,
|
||||
&path,
|
||||
cancel,
|
||||
)
|
||||
.await?;
|
||||
let elapsed = started_at.elapsed();
|
||||
|
||||
tracing::info!(
|
||||
compressed_length,
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"write metrics bucket at {path}",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// The return type is quite ugly, but we gain testability in isolation
|
||||
fn serialize_in_chunks<'a, F>(
|
||||
chunk_size: usize,
|
||||
|
||||
@@ -61,6 +61,7 @@ use crate::{
|
||||
metrics::disk_usage_based_eviction::METRICS,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
self,
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
secondary::SecondaryTenant,
|
||||
@@ -813,8 +814,8 @@ async fn collect_eviction_candidates(
|
||||
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
|
||||
|
||||
// get a snapshot of the list of tenants
|
||||
let tenants = tenant_manager
|
||||
.list_tenants()
|
||||
let tenants = tenant::mgr::list_tenants()
|
||||
.await
|
||||
.context("get list of tenants")?;
|
||||
|
||||
// TODO: avoid listing every layer in every tenant: this loop can block the executor,
|
||||
@@ -826,12 +827,8 @@ async fn collect_eviction_candidates(
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
|
||||
Ok(tenant) if tenant.is_active() => tenant,
|
||||
Ok(_) => {
|
||||
debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
|
||||
continue;
|
||||
}
|
||||
let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
// this can happen if tenant has lifecycle transition after we fetched it
|
||||
debug!("failed to get tenant: {e:#}");
|
||||
|
||||
@@ -1038,7 +1038,7 @@ paths:
|
||||
format: hex
|
||||
responses:
|
||||
"201":
|
||||
description: Timeline was created, or already existed with matching parameters
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -1068,17 +1068,11 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"409":
|
||||
description: Timeline already exists, with different parameters. Creation cannot proceed.
|
||||
description: Timeline already exists, creation skipped
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"429":
|
||||
description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
|
||||
@@ -49,8 +49,8 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::mgr::{
|
||||
GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError,
|
||||
TenantSlotUpsertError, TenantStateError,
|
||||
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
|
||||
TenantSlotError, TenantSlotUpsertError, TenantStateError,
|
||||
};
|
||||
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
|
||||
use crate::tenant::remote_timeline_client;
|
||||
@@ -249,11 +249,16 @@ impl From<GetTenantError> for ApiError {
|
||||
fn from(tse: GetTenantError) -> ApiError {
|
||||
match tse {
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
GetTenantError::Broken(reason) => {
|
||||
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
|
||||
}
|
||||
GetTenantError::NotActive(_) => {
|
||||
// Why is this not `ApiError::NotFound`?
|
||||
// Because we must be careful to never return 404 for a tenant if it does
|
||||
// in fact exist locally. If we did, the caller could draw the conclusion
|
||||
// that it can attach the tenant to another PS and we'd be in split-brain.
|
||||
//
|
||||
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
|
||||
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
||||
}
|
||||
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
|
||||
@@ -264,9 +269,6 @@ impl From<GetTenantError> for ApiError {
|
||||
impl From<GetActiveTenantError> for ApiError {
|
||||
fn from(e: GetActiveTenantError) -> ApiError {
|
||||
match e {
|
||||
GetActiveTenantError::Broken(reason) => {
|
||||
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
|
||||
}
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
|
||||
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
|
||||
GetActiveTenantError::NotFound(gte) => gte.into(),
|
||||
@@ -277,6 +279,19 @@ impl From<GetActiveTenantError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SetNewTenantConfigError> for ApiError {
|
||||
fn from(e: SetNewTenantConfigError) -> ApiError {
|
||||
match e {
|
||||
SetNewTenantConfigError::GetTenant(tid) => {
|
||||
ApiError::NotFound(anyhow!("tenant {}", tid).into())
|
||||
}
|
||||
e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::DeleteTimelineError::*;
|
||||
@@ -480,7 +495,7 @@ async fn timeline_create_handler(
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -520,13 +535,10 @@ async fn timeline_create_handler(
|
||||
HttpErrorBody::from_msg("Tenant shutting down".to_string()),
|
||||
)
|
||||
}
|
||||
Err(e @ tenant::CreateTimelineError::Conflict) => {
|
||||
json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string()))
|
||||
}
|
||||
Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response(
|
||||
StatusCode::TOO_MANY_REQUESTS,
|
||||
HttpErrorBody::from_msg(e.to_string()),
|
||||
),
|
||||
Err(
|
||||
e @ tenant::CreateTimelineError::Conflict
|
||||
| e @ tenant::CreateTimelineError::AlreadyCreating,
|
||||
) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())),
|
||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
|
||||
StatusCode::NOT_ACCEPTABLE,
|
||||
HttpErrorBody::from_msg(format!("{err:#}")),
|
||||
@@ -569,7 +581,7 @@ async fn timeline_list_handler(
|
||||
let response_data = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -607,7 +619,6 @@ async fn timeline_preserve_initdb_handler(
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
// Part of the process for disaster recovery from safekeeper-stored WAL:
|
||||
// If we don't recover into a new timeline but want to keep the timeline ID,
|
||||
@@ -615,9 +626,7 @@ async fn timeline_preserve_initdb_handler(
|
||||
// location where timeline recreation cand find it.
|
||||
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
@@ -659,7 +668,7 @@ async fn timeline_detail_handler(
|
||||
let timeline_info = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -846,7 +855,7 @@ async fn timeline_delete_handler(
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)
|
||||
.map_err(|e| {
|
||||
match e {
|
||||
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
|
||||
@@ -964,11 +973,10 @@ async fn tenant_list_handler(
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let response_data = state
|
||||
.tenant_manager
|
||||
.list_tenants()
|
||||
let response_data = mgr::list_tenants()
|
||||
.instrument(info_span!("tenant_list"))
|
||||
.await
|
||||
.map_err(|_| {
|
||||
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
|
||||
})?
|
||||
@@ -991,12 +999,9 @@ async fn tenant_status(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant_info = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
@@ -1069,7 +1074,9 @@ async fn tenant_size_handler(
|
||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
||||
let headers = request.headers();
|
||||
let state = get_state(&request);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
@@ -1077,12 +1084,6 @@ async fn tenant_size_handler(
|
||||
)));
|
||||
}
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
// this can be long operation
|
||||
let inputs = tenant
|
||||
.gather_size_inputs(
|
||||
@@ -1151,15 +1152,10 @@ async fn tenant_shard_split_handler(
|
||||
let state = get_state(&request);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let new_shards = state
|
||||
.tenant_manager
|
||||
.shard_split(
|
||||
tenant,
|
||||
tenant_shard_id,
|
||||
ShardCount::new(req.new_shard_count),
|
||||
req.new_stripe_size,
|
||||
&ctx,
|
||||
@@ -1377,11 +1373,8 @@ async fn get_tenant_config_handler(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
let response = HashMap::from([
|
||||
(
|
||||
@@ -1409,31 +1402,15 @@ async fn update_tenant_config_handler(
|
||||
let tenant_id = request_data.tenant_id;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let new_tenant_conf =
|
||||
let tenant_conf =
|
||||
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
let tenant = state
|
||||
state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
// This is a legacy API that only operates on attached tenants: the preferred
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf.clone(),
|
||||
tenant.get_generation(),
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
.set_new_tenant_config(tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1657,12 +1634,10 @@ async fn handle_tenant_break(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
||||
|
||||
let state = get_state(&r);
|
||||
state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?
|
||||
.set_broken("broken from test".to_owned())
|
||||
.await;
|
||||
let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
|
||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||
|
||||
tenant.set_broken("broken from test".to_owned()).await;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1906,7 +1881,7 @@ async fn active_timeline_of_active_tenant(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>, ApiError> {
|
||||
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
|
||||
@@ -435,7 +435,7 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(||
|
||||
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_remote_physical_size",
|
||||
"The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
|
||||
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
|
||||
// Corollary: If any files are missing from the index part, they won't be included here.
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
@@ -699,14 +699,6 @@ pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
.expect("Failed to register pageserver_startup_is_loading")
|
||||
});
|
||||
|
||||
pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_timeline_ephemeral_bytes",
|
||||
"Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated."
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
|
||||
/// like how long it took to load.
|
||||
///
|
||||
|
||||
@@ -760,7 +760,6 @@ impl PageServerHandler {
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
|
||||
timeline
|
||||
.import_basebackup_from_tar(
|
||||
tenant.clone(),
|
||||
&mut copyin_reader,
|
||||
base_lsn,
|
||||
self.broker_client.clone(),
|
||||
|
||||
@@ -1411,7 +1411,7 @@ impl Tenant {
|
||||
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn create_timeline(
|
||||
self: &Arc<Tenant>,
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
mut ancestor_start_lsn: Option<Lsn>,
|
||||
@@ -1559,7 +1559,7 @@ impl Tenant {
|
||||
})?;
|
||||
}
|
||||
|
||||
loaded_timeline.activate(self.clone(), broker_client, None, ctx);
|
||||
loaded_timeline.activate(broker_client, None, ctx);
|
||||
|
||||
Ok(loaded_timeline)
|
||||
}
|
||||
@@ -1731,12 +1731,7 @@ impl Tenant {
|
||||
let mut activated_timelines = 0;
|
||||
|
||||
for timeline in timelines_to_activate {
|
||||
timeline.activate(
|
||||
self.clone(),
|
||||
broker_client.clone(),
|
||||
background_jobs_can_start,
|
||||
ctx,
|
||||
);
|
||||
timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
|
||||
activated_timelines += 1;
|
||||
}
|
||||
|
||||
@@ -2068,12 +2063,7 @@ impl Tenant {
|
||||
TenantState::Active { .. } => {
|
||||
return Ok(());
|
||||
}
|
||||
TenantState::Broken { reason, .. } => {
|
||||
// This is fatal, and reported distinctly from the general case of "will never be active" because
|
||||
// it's logically a 500 to external API users (broken is always a bug).
|
||||
return Err(GetActiveTenantError::Broken(reason));
|
||||
}
|
||||
TenantState::Stopping { .. } => {
|
||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
// There's no chance the tenant can transition back into ::Active
|
||||
return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
|
||||
}
|
||||
@@ -2151,7 +2141,7 @@ impl Tenant {
|
||||
|
||||
// Shut down the timeline's remote client: this means that the indices we write
|
||||
// for child shards will not be invalidated by the parent shard deleting layers.
|
||||
tl_client.shutdown().await;
|
||||
tl_client.shutdown().await?;
|
||||
|
||||
// Download methods can still be used after shutdown, as they don't flow through the remote client's
|
||||
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
|
||||
|
||||
@@ -111,7 +111,6 @@ async fn create_local_delete_mark(
|
||||
let _ = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(&marker_path)
|
||||
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::models::{LocationConfigMode, ShardParameters};
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
@@ -16,7 +16,6 @@ use std::collections::{BTreeMap, HashMap};
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use sysinfo::SystemExt;
|
||||
use tokio::fs;
|
||||
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
|
||||
|
||||
@@ -40,10 +39,10 @@ use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
TenantConfOpt,
|
||||
};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::storage_layer::inmemory_layer;
|
||||
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
@@ -544,18 +543,6 @@ pub async fn init_tenant_mgr(
|
||||
|
||||
let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
|
||||
|
||||
// Initialize dynamic limits that depend on system resources
|
||||
let system_memory =
|
||||
sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory())
|
||||
.total_memory();
|
||||
let max_ephemeral_layer_bytes =
|
||||
conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024);
|
||||
tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory");
|
||||
inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store(
|
||||
max_ephemeral_layer_bytes,
|
||||
std::sync::atomic::Ordering::Relaxed,
|
||||
);
|
||||
|
||||
// Scan local filesystem for attached tenants
|
||||
let tenant_configs = init_load_tenant_configs(conf).await?;
|
||||
|
||||
@@ -888,6 +875,16 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
// caller will log how long we took
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum SetNewTenantConfigError {
|
||||
#[error(transparent)]
|
||||
GetTenant(#[from] GetTenantError),
|
||||
#[error(transparent)]
|
||||
Persist(anyhow::Error),
|
||||
#[error(transparent)]
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum UpsertLocationError {
|
||||
#[error("Bad config request: {0}")]
|
||||
@@ -913,21 +910,32 @@ impl TenantManager {
|
||||
self.conf
|
||||
}
|
||||
|
||||
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
|
||||
/// undergoing a state change (i.e. slot is InProgress).
|
||||
///
|
||||
/// The return Tenant is not guaranteed to be active: check its status after obtaing it, or
|
||||
/// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
|
||||
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
pub(crate) fn get_attached_tenant_shard(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
active_only: bool,
|
||||
) -> Result<Arc<Tenant>, GetTenantError> {
|
||||
let locked = self.tenants.read().unwrap();
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
|
||||
|
||||
match peek_slot {
|
||||
Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
|
||||
Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
|
||||
TenantState::Broken {
|
||||
reason,
|
||||
backtrace: _,
|
||||
} if active_only => Err(GetTenantError::Broken(reason)),
|
||||
TenantState::Active => Ok(Arc::clone(tenant)),
|
||||
_ => {
|
||||
if active_only {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
},
|
||||
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
|
||||
None | Some(TenantSlot::Secondary(_)) => {
|
||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||
@@ -1420,8 +1428,7 @@ impl TenantManager {
|
||||
.wait_to_become_active(activation_timeout)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
GetActiveTenantError::WillNotBecomeActive(_)
|
||||
| GetActiveTenantError::Broken(_) => {
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => {
|
||||
DeleteTenantError::InvalidState(tenant.current_state())
|
||||
}
|
||||
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
|
||||
@@ -1448,30 +1455,29 @@ impl TenantManager {
|
||||
result
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
pub(crate) async fn shard_split(
|
||||
&self,
|
||||
tenant: Arc<Tenant>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant_shard_id = *tenant.get_tenant_shard_id();
|
||||
let r = self
|
||||
.do_shard_split(tenant, new_shard_count, new_stripe_size, ctx)
|
||||
.do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
|
||||
.await;
|
||||
if r.is_err() {
|
||||
// Shard splitting might have left the original shard in a partially shut down state (it
|
||||
// stops the shard's remote timeline client). Reset it to ensure we leave things in
|
||||
// a working state.
|
||||
if self.get(tenant_shard_id).is_some() {
|
||||
tracing::warn!("Resetting after shard split failure");
|
||||
tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
|
||||
if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
|
||||
// Log this error because our return value will still be the original error, not this one. This is
|
||||
// a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
|
||||
// (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or
|
||||
// setting it broken probably won't help either.
|
||||
tracing::error!("Failed to reset: {e}");
|
||||
tracing::error!("Failed to reset {tenant_shard_id}: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1481,12 +1487,12 @@ impl TenantManager {
|
||||
|
||||
pub(crate) async fn do_shard_split(
|
||||
&self,
|
||||
tenant: Arc<Tenant>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant_shard_id = *tenant.get_tenant_shard_id();
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
// Validate the incoming request
|
||||
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
|
||||
@@ -1532,6 +1538,7 @@ impl TenantManager {
|
||||
// If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
|
||||
// have been left in a partially-shut-down state.
|
||||
tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
|
||||
self.reset_tenant(tenant_shard_id, false, ctx).await?;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
@@ -1929,23 +1936,38 @@ impl TenantManager {
|
||||
removal_result
|
||||
}
|
||||
|
||||
pub(crate) fn list_tenants(
|
||||
pub(crate) async fn set_new_tenant_config(
|
||||
&self,
|
||||
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
|
||||
let tenants = TENANTS.read().unwrap();
|
||||
let m = match &*tenants {
|
||||
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
|
||||
};
|
||||
Ok(m.iter()
|
||||
.filter_map(|(id, tenant)| match tenant {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
Some((*id, tenant.current_state(), tenant.generation()))
|
||||
}
|
||||
TenantSlot::Secondary(_) => None,
|
||||
TenantSlot::InProgress(_) => None,
|
||||
})
|
||||
.collect())
|
||||
new_tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), SetNewTenantConfigError> {
|
||||
// Legacy API: does not support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
|
||||
// Note that we use ShardParameters::default below.
|
||||
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
|
||||
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
|
||||
)));
|
||||
}
|
||||
|
||||
// This is a legacy API that only operates on attached tenants: the preferred
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf.clone(),
|
||||
tenant.generation,
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1958,12 +1980,51 @@ pub(crate) enum GetTenantError {
|
||||
|
||||
#[error("Tenant {0} is not active")]
|
||||
NotActive(TenantShardId),
|
||||
/// Broken is logically a subset of NotActive, but a distinct error is useful as
|
||||
/// NotActive is usually a retryable state for API purposes, whereas Broken
|
||||
/// is a stuck error state
|
||||
#[error("Tenant is broken: {0}")]
|
||||
Broken(String),
|
||||
|
||||
// Initializing or shutting down: cannot authoritatively say whether we have this tenant
|
||||
#[error("Tenant map is not available: {0}")]
|
||||
MapState(#[from] TenantMapError),
|
||||
}
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
///
|
||||
/// This method is cancel-safe.
|
||||
pub(crate) fn get_tenant(
|
||||
tenant_shard_id: TenantShardId,
|
||||
active_only: bool,
|
||||
) -> Result<Arc<Tenant>, GetTenantError> {
|
||||
let locked = TENANTS.read().unwrap();
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
|
||||
|
||||
match peek_slot {
|
||||
Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
|
||||
TenantState::Broken {
|
||||
reason,
|
||||
backtrace: _,
|
||||
} if active_only => Err(GetTenantError::Broken(reason)),
|
||||
TenantState::Active => Ok(Arc::clone(tenant)),
|
||||
_ => {
|
||||
if active_only {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
},
|
||||
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
|
||||
None | Some(TenantSlot::Secondary(_)) => {
|
||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GetActiveTenantError {
|
||||
/// We may time out either while TenantSlot is InProgress, or while the Tenant
|
||||
@@ -1987,12 +2048,6 @@ pub(crate) enum GetActiveTenantError {
|
||||
/// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken)
|
||||
#[error("will not become active. Current state: {0}")]
|
||||
WillNotBecomeActive(TenantState),
|
||||
|
||||
/// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as
|
||||
/// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should
|
||||
/// never happen.
|
||||
#[error("Tenant is broken: {0}")]
|
||||
Broken(String),
|
||||
}
|
||||
|
||||
/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
|
||||
@@ -2212,6 +2267,27 @@ pub(crate) enum TenantMapListError {
|
||||
Initializing,
|
||||
}
|
||||
|
||||
///
|
||||
/// Get list of tenants, for the mgmt API
|
||||
///
|
||||
pub(crate) async fn list_tenants(
|
||||
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
|
||||
let tenants = TENANTS.read().unwrap();
|
||||
let m = match &*tenants {
|
||||
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
|
||||
};
|
||||
Ok(m.iter()
|
||||
.filter_map(|(id, tenant)| match tenant {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
Some((*id, tenant.current_state(), tenant.generation()))
|
||||
}
|
||||
TenantSlot::Secondary(_) => None,
|
||||
TenantSlot::InProgress(_) => None,
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum TenantMapInsertError {
|
||||
#[error(transparent)]
|
||||
|
||||
@@ -217,7 +217,7 @@ use crate::task_mgr::shutdown_token;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::download::download_retry;
|
||||
use crate::tenant::storage_layer::AsLayerDesc;
|
||||
use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
|
||||
use crate::tenant::upload_queue::Delete;
|
||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
@@ -266,6 +266,15 @@ pub enum MaybeDeletedIndexPart {
|
||||
Deleted(IndexPart),
|
||||
}
|
||||
|
||||
/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum StopError {
|
||||
/// Returned if the upload queue was never initialized.
|
||||
/// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
|
||||
#[error("queue is not initialized")]
|
||||
QueueUninitialized,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum PersistIndexPartWithDeletedFlagError {
|
||||
#[error("another task is already setting the deleted_flag, started at {0:?}")]
|
||||
@@ -390,10 +399,15 @@ impl RemoteTimelineClient {
|
||||
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
|
||||
))?;
|
||||
|
||||
{
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
}
|
||||
// also locks upload queue, without dropping the guard above it will be a deadlock
|
||||
self.stop().expect("initialized line above");
|
||||
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
self.stop_impl(&mut upload_queue);
|
||||
|
||||
upload_queue
|
||||
.stopped_mut()
|
||||
@@ -407,8 +421,7 @@ impl RemoteTimelineClient {
|
||||
match &mut *self.upload_queue.lock().unwrap() {
|
||||
UploadQueue::Uninitialized => None,
|
||||
UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(),
|
||||
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
|
||||
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q
|
||||
UploadQueue::Stopped(q) => q
|
||||
.upload_queue_for_deletion
|
||||
.get_last_remote_consistent_lsn_projected(),
|
||||
}
|
||||
@@ -418,8 +431,7 @@ impl RemoteTimelineClient {
|
||||
match &mut *self.upload_queue.lock().unwrap() {
|
||||
UploadQueue::Uninitialized => None,
|
||||
UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()),
|
||||
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None,
|
||||
UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some(
|
||||
UploadQueue::Stopped(q) => Some(
|
||||
q.upload_queue_for_deletion
|
||||
.get_last_remote_consistent_lsn_visible(),
|
||||
),
|
||||
@@ -886,7 +898,7 @@ impl RemoteTimelineClient {
|
||||
/// Wait for all previously scheduled operations to complete, and then stop.
|
||||
///
|
||||
/// Not cancellation safe
|
||||
pub(crate) async fn shutdown(self: &Arc<Self>) {
|
||||
pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
|
||||
// On cancellation the queue is left in ackward state of refusing new operations but
|
||||
// proper stop is yet to be called. On cancel the original or some later task must call
|
||||
// `stop` or `shutdown`.
|
||||
@@ -897,12 +909,8 @@ impl RemoteTimelineClient {
|
||||
let fut = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = match &mut *guard {
|
||||
UploadQueue::Stopped(_) => return,
|
||||
UploadQueue::Uninitialized => {
|
||||
// transition into Stopped state
|
||||
self.stop_impl(&mut guard);
|
||||
return;
|
||||
}
|
||||
UploadQueue::Stopped(_) => return Ok(()),
|
||||
UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
|
||||
UploadQueue::Initialized(ref mut init) => init,
|
||||
};
|
||||
|
||||
@@ -934,7 +942,7 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
}
|
||||
|
||||
self.stop();
|
||||
self.stop()
|
||||
}
|
||||
|
||||
/// Set the deleted_at field in the remote index file.
|
||||
@@ -1316,7 +1324,12 @@ impl RemoteTimelineClient {
|
||||
// upload finishes or times out soon enough.
|
||||
if cancel.is_cancelled() {
|
||||
info!("upload task cancelled by shutdown request");
|
||||
self.stop();
|
||||
match self.stop() {
|
||||
Ok(()) => {}
|
||||
Err(StopError::QueueUninitialized) => {
|
||||
unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1571,23 +1584,17 @@ impl RemoteTimelineClient {
|
||||
/// In-progress operations will still be running after this function returns.
|
||||
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
||||
/// to wait for them to complete, after calling this function.
|
||||
pub(crate) fn stop(&self) {
|
||||
pub(crate) fn stop(&self) -> Result<(), StopError> {
|
||||
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
||||
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
|
||||
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
self.stop_impl(&mut guard);
|
||||
}
|
||||
|
||||
fn stop_impl(&self, guard: &mut std::sync::MutexGuard<UploadQueue>) {
|
||||
match &mut **guard {
|
||||
UploadQueue::Uninitialized => {
|
||||
info!("UploadQueue is in state Uninitialized, nothing to do");
|
||||
**guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized);
|
||||
}
|
||||
match &mut *guard {
|
||||
UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
|
||||
UploadQueue::Stopped(_) => {
|
||||
// nothing to do
|
||||
info!("another concurrent task already shut down the queue");
|
||||
Ok(())
|
||||
}
|
||||
UploadQueue::Initialized(initialized) => {
|
||||
info!("shutting down upload queue");
|
||||
@@ -1620,13 +1627,11 @@ impl RemoteTimelineClient {
|
||||
};
|
||||
|
||||
let upload_queue = std::mem::replace(
|
||||
&mut **guard,
|
||||
UploadQueue::Stopped(UploadQueueStopped::Deletable(
|
||||
UploadQueueStoppedDeletable {
|
||||
upload_queue_for_deletion,
|
||||
deleted_at: SetDeletedFlagProgress::NotRunning,
|
||||
},
|
||||
)),
|
||||
&mut *guard,
|
||||
UploadQueue::Stopped(UploadQueueStopped {
|
||||
upload_queue_for_deletion,
|
||||
deleted_at: SetDeletedFlagProgress::NotRunning,
|
||||
}),
|
||||
);
|
||||
if let UploadQueue::Initialized(qi) = upload_queue {
|
||||
qi
|
||||
@@ -1655,6 +1660,10 @@ impl RemoteTimelineClient {
|
||||
// which is exactly what we want to happen.
|
||||
drop(op);
|
||||
}
|
||||
|
||||
// We're done.
|
||||
drop(guard);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,11 +11,11 @@ use crate::{
|
||||
disk_usage_eviction_task::{
|
||||
finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer,
|
||||
},
|
||||
is_temporary,
|
||||
metrics::SECONDARY_MODE,
|
||||
tenant::{
|
||||
config::SecondaryLocationConfig,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
remote_timeline_client::{
|
||||
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
@@ -964,7 +964,7 @@ async fn init_timeline_state(
|
||||
continue;
|
||||
} else if crate::is_temporary(&file_path)
|
||||
|| is_temp_download_file(&file_path)
|
||||
|| is_ephemeral_file(file_name)
|
||||
|| is_temporary(&file_path)
|
||||
{
|
||||
// Temporary files are frequently left behind from restarting during downloads
|
||||
tracing::info!("Cleaning up temporary file {file_path}");
|
||||
|
||||
@@ -9,7 +9,6 @@ use crate::{
|
||||
metrics::SECONDARY_MODE,
|
||||
tenant::{
|
||||
config::AttachmentMode,
|
||||
mgr::GetTenantError,
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::remote_heatmap_path,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
@@ -293,11 +292,8 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
"Starting heatmap write on command");
|
||||
let tenant = self
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(*tenant_shard_id)
|
||||
.get_attached_tenant_shard(*tenant_shard_id, true)
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
if !tenant.is_active() {
|
||||
return Err(GetTenantError::NotActive(*tenant_shard_id).into());
|
||||
}
|
||||
|
||||
Ok(UploadPending {
|
||||
// Ignore our state for last digest: this forces an upload even if nothing has changed
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
pub mod delta_layer;
|
||||
mod filename;
|
||||
pub mod image_layer;
|
||||
pub(crate) mod inmemory_layer;
|
||||
mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
|
||||
|
||||
@@ -23,12 +23,8 @@ use tracing::*;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use super::{
|
||||
@@ -74,8 +70,6 @@ pub struct InMemoryLayerInner {
|
||||
/// Each serialized Value is preceded by a 'u32' length field.
|
||||
/// PerSeg::page_versions map stores offsets into this file.
|
||||
file: EphemeralFile,
|
||||
|
||||
resource_units: GlobalResourceUnits,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for InMemoryLayerInner {
|
||||
@@ -84,121 +78,6 @@ impl std::fmt::Debug for InMemoryLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline,
|
||||
/// to minimize contention.
|
||||
///
|
||||
/// This global state is used to implement behaviors that require a global view of the system, e.g.
|
||||
/// rolling layers proactively to limit the total amount of dirty data.
|
||||
pub(crate) struct GlobalResources {
|
||||
// Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
|
||||
// Zero means unlimited.
|
||||
pub(crate) max_dirty_bytes: AtomicU64,
|
||||
// How many bytes are in all EphemeralFile objects
|
||||
dirty_bytes: AtomicU64,
|
||||
// How many layers are contributing to dirty_bytes
|
||||
dirty_layers: AtomicUsize,
|
||||
}
|
||||
|
||||
// Per-timeline RAII struct for its contribution to [`GlobalResources`]
|
||||
struct GlobalResourceUnits {
|
||||
// How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
|
||||
// for decrementing the global counter by this many bytes when dropped.
|
||||
dirty_bytes: u64,
|
||||
}
|
||||
|
||||
impl GlobalResourceUnits {
|
||||
// Hint for the layer append path to update us when the layer size differs from the last
|
||||
// call to update_size by this much. If we don't reach this threshold, we'll still get
|
||||
// updated when the Timeline "ticks" in the background.
|
||||
const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;
|
||||
|
||||
fn new() -> Self {
|
||||
GLOBAL_RESOURCES
|
||||
.dirty_layers
|
||||
.fetch_add(1, AtomicOrdering::Relaxed);
|
||||
Self { dirty_bytes: 0 }
|
||||
}
|
||||
|
||||
/// Do not call this frequently: all timelines will write to these same global atomics,
|
||||
/// so this is a relatively expensive operation. Wait at least a few seconds between calls.
|
||||
///
|
||||
/// Returns the effective layer size limit that should be applied, if any, to keep
|
||||
/// the total number of dirty bytes below the configured maximum.
|
||||
fn publish_size(&mut self, size: u64) -> Option<u64> {
|
||||
let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
|
||||
Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
|
||||
Ordering::Greater => {
|
||||
let delta = size - self.dirty_bytes;
|
||||
let old = GLOBAL_RESOURCES
|
||||
.dirty_bytes
|
||||
.fetch_add(delta, AtomicOrdering::Relaxed);
|
||||
old + delta
|
||||
}
|
||||
Ordering::Less => {
|
||||
let delta = self.dirty_bytes - size;
|
||||
let old = GLOBAL_RESOURCES
|
||||
.dirty_bytes
|
||||
.fetch_sub(delta, AtomicOrdering::Relaxed);
|
||||
old - delta
|
||||
}
|
||||
};
|
||||
|
||||
// This is a sloppy update: concurrent updates to the counter will race, and the exact
|
||||
// value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
|
||||
// That's okay: as long as the metric contains some recent value, it doesn't have to always
|
||||
// be literally the last update.
|
||||
TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
|
||||
|
||||
self.dirty_bytes = size;
|
||||
|
||||
let max_dirty_bytes = GLOBAL_RESOURCES
|
||||
.max_dirty_bytes
|
||||
.load(AtomicOrdering::Relaxed);
|
||||
if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
|
||||
// Set the layer file limit to the average layer size: this implies that all above-average
|
||||
// sized layers will be elegible for freezing. They will be frozen in the order they
|
||||
// next enter publish_size.
|
||||
Some(
|
||||
new_global_dirty_bytes
|
||||
/ GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// Call publish_size if the input size differs from last published size by more than
|
||||
// the drift limit
|
||||
fn maybe_publish_size(&mut self, size: u64) {
|
||||
let publish = match size.cmp(&self.dirty_bytes) {
|
||||
Ordering::Equal => false,
|
||||
Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
|
||||
Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT,
|
||||
};
|
||||
|
||||
if publish {
|
||||
self.publish_size(size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for GlobalResourceUnits {
|
||||
fn drop(&mut self) {
|
||||
GLOBAL_RESOURCES
|
||||
.dirty_layers
|
||||
.fetch_sub(1, AtomicOrdering::Relaxed);
|
||||
|
||||
// Subtract our contribution to the global total dirty bytes
|
||||
self.publish_size(0);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
|
||||
max_dirty_bytes: AtomicU64::new(0),
|
||||
dirty_bytes: AtomicU64::new(0),
|
||||
dirty_layers: AtomicUsize::new(0),
|
||||
};
|
||||
|
||||
impl InMemoryLayer {
|
||||
pub(crate) fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
@@ -214,10 +93,6 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn try_len(&self) -> Option<u64> {
|
||||
self.inner.try_read().map(|i| i.file.len()).ok()
|
||||
}
|
||||
|
||||
pub(crate) fn assert_writable(&self) {
|
||||
assert!(self.end_lsn.get().is_none());
|
||||
}
|
||||
@@ -453,7 +328,6 @@ impl InMemoryLayer {
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
index: HashMap::new(),
|
||||
file,
|
||||
resource_units: GlobalResourceUnits::new(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -504,18 +378,9 @@ impl InMemoryLayer {
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
}
|
||||
|
||||
let size = locked_inner.file.len();
|
||||
locked_inner.resource_units.maybe_publish_size(size);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn tick(&self) -> Option<u64> {
|
||||
let mut inner = self.inner.write().await;
|
||||
let size = inner.file.len();
|
||||
inner.resource_units.publish_size(size)
|
||||
}
|
||||
|
||||
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
|
||||
// TODO: Currently, we just leak the storage for any deleted keys
|
||||
Ok(())
|
||||
|
||||
@@ -19,7 +19,7 @@ use pageserver_api::{
|
||||
keyspace::KeySpaceAccum,
|
||||
models::{
|
||||
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
|
||||
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
|
||||
EvictionPolicy, LayerMapInfo, TimelineState,
|
||||
},
|
||||
reltag::BlockNumber,
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
@@ -54,7 +54,6 @@ use std::{
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::tenant::{
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
@@ -65,6 +64,7 @@ use crate::{
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
};
|
||||
use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
|
||||
use crate::{
|
||||
disk_usage_eviction_task::finite_f32,
|
||||
tenant::storage_layer::{
|
||||
@@ -1142,79 +1142,6 @@ impl Timeline {
|
||||
self.flush_frozen_layers_and_wait().await
|
||||
}
|
||||
|
||||
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
|
||||
///
|
||||
/// This is for use in background housekeeping, to provide guarantees of layers closing eventually
|
||||
/// even if there are no ongoing writes to drive that.
|
||||
async fn maybe_freeze_ephemeral_layer(&self) {
|
||||
let Ok(_write_guard) = self.write_lock.try_lock() else {
|
||||
// If the write lock is held, there is an active wal receiver: rolling open layers
|
||||
// is their responsibility while they hold this lock.
|
||||
return;
|
||||
};
|
||||
|
||||
let Ok(layers_guard) = self.layers.try_read() else {
|
||||
// Don't block if the layer lock is busy
|
||||
return;
|
||||
};
|
||||
|
||||
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
|
||||
// No open layer, no work to do.
|
||||
return;
|
||||
};
|
||||
|
||||
let Some(current_size) = open_layer.try_len() else {
|
||||
// Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
|
||||
// read lock to get size should always succeed.
|
||||
tracing::warn!("Lock conflict while reading size of open layer");
|
||||
return;
|
||||
};
|
||||
|
||||
let current_lsn = self.get_last_record_lsn();
|
||||
|
||||
let checkpoint_distance_override = open_layer.tick().await;
|
||||
|
||||
if let Some(size_override) = checkpoint_distance_override {
|
||||
if current_size > size_override {
|
||||
// This is not harmful, but it only happens in relatively rare cases where
|
||||
// time-based checkpoints are not happening fast enough to keep the amount of
|
||||
// ephemeral data within configured limits. It's a sign of stress on the system.
|
||||
tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure");
|
||||
}
|
||||
}
|
||||
|
||||
let checkpoint_distance =
|
||||
checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance());
|
||||
|
||||
if self.should_roll(
|
||||
current_size,
|
||||
current_size,
|
||||
checkpoint_distance,
|
||||
self.get_last_record_lsn(),
|
||||
self.last_freeze_at.load(),
|
||||
*self.last_freeze_ts.read().unwrap(),
|
||||
) {
|
||||
match open_layer.info() {
|
||||
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
|
||||
// We may reach this point if the layer was already frozen by not yet flushed: flushing
|
||||
// happens asynchronously in the background.
|
||||
tracing::debug!(
|
||||
"Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
|
||||
);
|
||||
}
|
||||
InMemoryLayerInfo::Open { .. } => {
|
||||
// Upgrade to a write lock and freeze the layer
|
||||
drop(layers_guard);
|
||||
let mut layers_guard = self.layers.write().await;
|
||||
layers_guard
|
||||
.try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
self.flush_frozen_layers();
|
||||
}
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
pub(crate) async fn compact(
|
||||
self: &Arc<Self>,
|
||||
@@ -1237,11 +1164,6 @@ impl Timeline {
|
||||
(guard, permit)
|
||||
};
|
||||
|
||||
// Prior to compaction, check if an open ephemeral layer should be closed: this provides
|
||||
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
|
||||
// an ephemeral layer open forever when idle.
|
||||
self.maybe_freeze_ephemeral_layer().await;
|
||||
|
||||
// this wait probably never needs any "long time spent" logging, because we already nag if
|
||||
// compaction task goes over it's period (20s) which is quite often in production.
|
||||
let (_guard, _permit) = tokio::select! {
|
||||
@@ -1274,7 +1196,6 @@ impl Timeline {
|
||||
|
||||
pub(crate) fn activate(
|
||||
self: &Arc<Self>,
|
||||
parent: Arc<crate::tenant::Tenant>,
|
||||
broker_client: BrokerClientChannel,
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
@@ -1285,7 +1206,7 @@ impl Timeline {
|
||||
}
|
||||
self.launch_wal_receiver(ctx, broker_client);
|
||||
self.set_state(TimelineState::Active);
|
||||
self.launch_eviction_task(parent, background_jobs_can_start);
|
||||
self.launch_eviction_task(background_jobs_can_start);
|
||||
}
|
||||
|
||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
||||
@@ -1320,7 +1241,11 @@ impl Timeline {
|
||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||
// obviously it does not make sense to stop while we wait for it, but what
|
||||
// about corner cases like s3 suddenly hanging up?
|
||||
client.shutdown().await;
|
||||
if let Err(e) = client.shutdown().await {
|
||||
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
||||
// we have some extra WAL replay to do next time the timeline starts.
|
||||
warn!("failed to flush to remote storage: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -1357,7 +1282,12 @@ impl Timeline {
|
||||
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
|
||||
// case our caller wants to use that for a deletion
|
||||
if let Some(remote_client) = self.remote_client.as_ref() {
|
||||
remote_client.stop();
|
||||
match remote_client.stop() {
|
||||
Ok(()) => {}
|
||||
Err(StopError::QueueUninitialized) => {
|
||||
// Shutting down during initialization is legal
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!("Waiting for tasks...");
|
||||
@@ -1513,53 +1443,6 @@ impl Timeline {
|
||||
Err(EvictionError::Timeout) => Ok(Some(false)),
|
||||
}
|
||||
}
|
||||
|
||||
fn should_roll(
|
||||
&self,
|
||||
layer_size: u64,
|
||||
projected_layer_size: u64,
|
||||
checkpoint_distance: u64,
|
||||
projected_lsn: Lsn,
|
||||
last_freeze_at: Lsn,
|
||||
last_freeze_ts: Instant,
|
||||
) -> bool {
|
||||
let distance = projected_lsn.widening_sub(last_freeze_at);
|
||||
|
||||
// Rolling the open layer can be triggered by:
|
||||
// 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
|
||||
// the safekeepers need to store. For sharded tenants, we multiply by shard count to
|
||||
// account for how writes are distributed across shards: we expect each node to consume
|
||||
// 1/count of the LSN on average.
|
||||
// 2. The size of the currently open layer.
|
||||
// 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
|
||||
// up and suspend activity.
|
||||
if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to LSN distance ({})",
|
||||
projected_lsn, layer_size, distance
|
||||
);
|
||||
|
||||
true
|
||||
} else if projected_layer_size >= checkpoint_distance {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to layer size ({})",
|
||||
projected_lsn, layer_size, projected_layer_size
|
||||
);
|
||||
|
||||
true
|
||||
} else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
|
||||
projected_lsn,
|
||||
layer_size,
|
||||
last_freeze_ts.elapsed()
|
||||
);
|
||||
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of times we will compute partition within a checkpoint distance.
|
||||
@@ -2713,10 +2596,6 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
|
||||
let open_layer = open_layer.clone();
|
||||
drop(guard);
|
||||
|
||||
result = match open_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
@@ -2734,7 +2613,10 @@ impl Timeline {
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
Box::new(move || open_layer.traversal_id()),
|
||||
Box::new({
|
||||
let open_layer = Arc::clone(open_layer);
|
||||
move || open_layer.traversal_id()
|
||||
}),
|
||||
));
|
||||
continue 'outer;
|
||||
}
|
||||
@@ -2744,10 +2626,6 @@ impl Timeline {
|
||||
if cont_lsn > start_lsn {
|
||||
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
|
||||
let frozen_layer = frozen_layer.clone();
|
||||
drop(guard);
|
||||
|
||||
result = match frozen_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
@@ -2765,7 +2643,10 @@ impl Timeline {
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
Box::new(move || frozen_layer.traversal_id()),
|
||||
Box::new({
|
||||
let frozen_layer = Arc::clone(frozen_layer);
|
||||
move || frozen_layer.traversal_id()
|
||||
}),
|
||||
));
|
||||
continue 'outer;
|
||||
}
|
||||
@@ -2773,8 +2654,6 @@ impl Timeline {
|
||||
|
||||
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
drop(guard);
|
||||
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, lsn_floor);
|
||||
@@ -4581,6 +4460,49 @@ impl<'a> TimelineWriter<'a> {
|
||||
res
|
||||
}
|
||||
|
||||
/// "Tick" the timeline writer: it will roll the open layer if required
|
||||
/// and do nothing else.
|
||||
pub(crate) async fn tick(&mut self) -> anyhow::Result<()> {
|
||||
self.open_layer_if_present().await?;
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let action = self.get_open_layer_action(last_record_lsn, 0);
|
||||
if action == OpenLayerAction::Roll {
|
||||
self.roll_layer(last_record_lsn).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Populate the timeline writer state only if an in-memory layer
|
||||
/// is already open.
|
||||
async fn open_layer_if_present(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.write_guard.is_none());
|
||||
|
||||
let open_layer = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
match layers.open_layer {
|
||||
Some(ref open_layer) => open_layer.clone(),
|
||||
None => {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let initial_size = open_layer.size().await?;
|
||||
let last_freeze_at = self.last_freeze_at.load();
|
||||
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
|
||||
self.write_guard.replace(TimelineWriterState::new(
|
||||
open_layer,
|
||||
initial_size,
|
||||
last_freeze_at,
|
||||
last_freeze_ts,
|
||||
));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_open_layer_action(
|
||||
&mut self,
|
||||
at: Lsn,
|
||||
@@ -4652,14 +4574,43 @@ impl<'a> TimelineWriter<'a> {
|
||||
return OpenLayerAction::None;
|
||||
}
|
||||
|
||||
if self.tl.should_roll(
|
||||
state.current_size,
|
||||
state.current_size + new_value_size,
|
||||
self.get_checkpoint_distance(),
|
||||
lsn,
|
||||
state.cached_last_freeze_at,
|
||||
state.cached_last_freeze_ts,
|
||||
) {
|
||||
let distance = lsn.widening_sub(state.cached_last_freeze_at);
|
||||
let proposed_open_layer_size = state.current_size + new_value_size;
|
||||
|
||||
// Rolling the open layer can be triggered by:
|
||||
// 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
|
||||
// the safekeepers need to store. For sharded tenants, we multiply by shard count to
|
||||
// account for how writes are distributed across shards: we expect each node to consume
|
||||
// 1/count of the LSN on average.
|
||||
// 2. The size of the currently open layer.
|
||||
// 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
|
||||
// up and suspend activity.
|
||||
if distance
|
||||
>= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
|
||||
{
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to LSN distance ({})",
|
||||
lsn, state.current_size, distance
|
||||
);
|
||||
|
||||
OpenLayerAction::Roll
|
||||
} else if proposed_open_layer_size >= self.get_checkpoint_distance() {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to layer size ({})",
|
||||
lsn, state.current_size, proposed_open_layer_size
|
||||
);
|
||||
|
||||
OpenLayerAction::Roll
|
||||
} else if distance > 0
|
||||
&& state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
|
||||
{
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
|
||||
lsn,
|
||||
state.current_size,
|
||||
state.cached_last_freeze_ts.elapsed()
|
||||
);
|
||||
|
||||
OpenLayerAction::Roll
|
||||
} else {
|
||||
OpenLayerAction::None
|
||||
|
||||
@@ -16,7 +16,9 @@ use crate::{
|
||||
tenant::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||
remote_timeline_client::{
|
||||
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
|
||||
},
|
||||
CreateTimelineCause, DeleteTimelineError, Tenant,
|
||||
},
|
||||
};
|
||||
@@ -48,7 +50,19 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||
|
||||
// Prevent new uploads from starting.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
remote_client.stop();
|
||||
let res = remote_client.stop();
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(e) => match e {
|
||||
remote_timeline_client::StopError::QueueUninitialized => {
|
||||
// This case shouldn't happen currently because the
|
||||
// load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
|
||||
// That is, before we declare the Tenant as Active.
|
||||
// But we only allow calls to delete_timeline on Active tenants.
|
||||
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Stop & wait for the remaining timeline tasks, including upload tasks.
|
||||
|
||||
@@ -51,7 +51,6 @@ pub struct EvictionTaskTenantState {
|
||||
impl Timeline {
|
||||
pub(super) fn launch_eviction_task(
|
||||
self: &Arc<Self>,
|
||||
parent: Arc<Tenant>,
|
||||
background_tasks_can_start: Option<&completion::Barrier>,
|
||||
) {
|
||||
let self_clone = Arc::clone(self);
|
||||
@@ -73,14 +72,14 @@ impl Timeline {
|
||||
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
|
||||
};
|
||||
|
||||
self_clone.eviction_task(parent, cancel).await;
|
||||
self_clone.eviction_task(cancel).await;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
||||
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
|
||||
// acquire the gate guard only once within a useful span
|
||||
@@ -104,7 +103,7 @@ impl Timeline {
|
||||
loop {
|
||||
let policy = self.get_eviction_policy();
|
||||
let cf = self
|
||||
.eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx)
|
||||
.eviction_iteration(&policy, &cancel, &guard, &ctx)
|
||||
.await;
|
||||
|
||||
match cf {
|
||||
@@ -124,7 +123,6 @@ impl Timeline {
|
||||
#[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
|
||||
async fn eviction_iteration(
|
||||
self: &Arc<Self>,
|
||||
tenant: &Tenant,
|
||||
policy: &EvictionPolicy,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -139,7 +137,7 @@ impl Timeline {
|
||||
}
|
||||
EvictionPolicy::LayerAccessThreshold(p) => {
|
||||
match self
|
||||
.eviction_iteration_threshold(tenant, p, cancel, gate, ctx)
|
||||
.eviction_iteration_threshold(p, cancel, gate, ctx)
|
||||
.await
|
||||
{
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
@@ -148,11 +146,7 @@ impl Timeline {
|
||||
(p.period, p.threshold)
|
||||
}
|
||||
EvictionPolicy::OnlyImitiate(p) => {
|
||||
if self
|
||||
.imitiate_only(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
.is_break()
|
||||
{
|
||||
if self.imitiate_only(p, cancel, gate, ctx).await.is_break() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
(p.period, p.threshold)
|
||||
@@ -181,7 +175,6 @@ impl Timeline {
|
||||
|
||||
async fn eviction_iteration_threshold(
|
||||
self: &Arc<Self>,
|
||||
tenant: &Tenant,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -200,10 +193,7 @@ impl Timeline {
|
||||
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
|
||||
};
|
||||
|
||||
match self
|
||||
.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
{
|
||||
match self.imitate_layer_accesses(p, cancel, gate, ctx).await {
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
@@ -325,7 +315,6 @@ impl Timeline {
|
||||
/// disk usage based eviction task.
|
||||
async fn imitiate_only(
|
||||
self: &Arc<Self>,
|
||||
tenant: &Tenant,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -342,8 +331,7 @@ impl Timeline {
|
||||
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
|
||||
};
|
||||
|
||||
self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
self.imitate_layer_accesses(p, cancel, gate, ctx).await
|
||||
}
|
||||
|
||||
/// If we evict layers but keep cached values derived from those layers, then
|
||||
@@ -373,7 +361,6 @@ impl Timeline {
|
||||
#[instrument(skip_all)]
|
||||
async fn imitate_layer_accesses(
|
||||
&self,
|
||||
tenant: &Tenant,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -409,11 +396,17 @@ impl Timeline {
|
||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||
// The others wait until the calculation is done so that they take into account the
|
||||
// imitated accesses that the winner made.
|
||||
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
|
||||
Ok(t) => t,
|
||||
Err(_) => {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
};
|
||||
let mut state = tenant.eviction_task_tenant_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx)
|
||||
self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
|
||||
.await;
|
||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
|
||||
}
|
||||
@@ -487,7 +480,7 @@ impl Timeline {
|
||||
#[instrument(skip_all)]
|
||||
async fn imitate_synthetic_size_calculation_worker(
|
||||
&self,
|
||||
tenant: &Tenant,
|
||||
tenant: &Arc<Tenant>,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
|
||||
@@ -86,7 +86,6 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
/// Prepares timeline data by loading it from the basebackup archive.
|
||||
pub(crate) async fn import_basebackup_from_tar(
|
||||
self,
|
||||
tenant: Arc<Tenant>,
|
||||
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
|
||||
base_lsn: Lsn,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
@@ -115,7 +114,7 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
|
||||
// All the data has been imported. Insert the Timeline into the tenant's timelines map
|
||||
let tl = self.finish_creation()?;
|
||||
tl.activate(tenant, broker_client, None, ctx);
|
||||
tl.activate(broker_client, None, ctx);
|
||||
Ok(tl)
|
||||
}
|
||||
|
||||
|
||||
@@ -33,9 +33,11 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroU64;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::select;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -89,27 +91,31 @@ impl WalReceiver {
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
debug!("WAL receiver manager started, connecting to broker");
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let mut connection_manager_state = ConnectionManagerState::new(
|
||||
timeline,
|
||||
conf,
|
||||
);
|
||||
while !cancel.is_cancelled() {
|
||||
let loop_step_result = connection_manager_loop_step(
|
||||
&mut broker_client,
|
||||
&mut connection_manager_state,
|
||||
&walreceiver_ctx,
|
||||
&cancel,
|
||||
&loop_status,
|
||||
).await;
|
||||
match loop_step_result {
|
||||
Ok(()) => continue,
|
||||
Err(_cancelled) => {
|
||||
trace!("Connection manager loop ended, shutting down");
|
||||
loop {
|
||||
select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
trace!("WAL receiver shutdown requested, shutting down");
|
||||
break;
|
||||
}
|
||||
},
|
||||
loop_step_result = connection_manager_loop_step(
|
||||
&mut broker_client,
|
||||
&mut connection_manager_state,
|
||||
&walreceiver_ctx,
|
||||
&loop_status,
|
||||
) => match loop_step_result {
|
||||
ControlFlow::Continue(()) => continue,
|
||||
ControlFlow::Break(()) => {
|
||||
trace!("Connection manager loop ended, shutting down");
|
||||
break;
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
connection_manager_state.shutdown().await;
|
||||
*loop_status.write().unwrap() = None;
|
||||
Ok(())
|
||||
@@ -191,9 +197,6 @@ impl<E: Clone> TaskHandle<E> {
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Cancellation-safe.
|
||||
async fn next_task_event(&mut self) -> TaskEvent<E> {
|
||||
match self.events_receiver.changed().await {
|
||||
Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::metrics::{
|
||||
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
|
||||
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::task_mgr::{shutdown_token, TaskKind};
|
||||
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
@@ -27,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use storage_broker::{BrokerClientChannel, Code, Streaming};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tokio::select;
|
||||
use tracing::*;
|
||||
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
@@ -45,33 +45,27 @@ use super::{
|
||||
TaskEvent, TaskHandle,
|
||||
};
|
||||
|
||||
pub(crate) struct Cancelled;
|
||||
|
||||
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
|
||||
/// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
|
||||
/// If storage broker subscription is cancelled, exits.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Not cancellation-safe. Use `cancel` token to request cancellation.
|
||||
pub(super) async fn connection_manager_loop_step(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
connection_manager_state: &mut ConnectionManagerState,
|
||||
ctx: &RequestContext,
|
||||
cancel: &CancellationToken,
|
||||
manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
|
||||
) -> Result<(), Cancelled> {
|
||||
match tokio::select! {
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); },
|
||||
st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st }
|
||||
} {
|
||||
) -> ControlFlow<(), ()> {
|
||||
match connection_manager_state
|
||||
.timeline
|
||||
.wait_to_become_active(ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
Err(new_state) => {
|
||||
debug!(
|
||||
?new_state,
|
||||
"state changed, stopping wal connection manager loop"
|
||||
);
|
||||
return Err(Cancelled);
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,7 +86,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// Subscribe to the broker updates. Stream shares underlying TCP connection
|
||||
// with other streams on this client (other connection managers). When
|
||||
// object goes out of scope, stream finishes in drop() automatically.
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
|
||||
debug!("Subscribed for broker timeline updates");
|
||||
|
||||
loop {
|
||||
@@ -100,7 +94,6 @@ pub(super) async fn connection_manager_loop_step(
|
||||
|
||||
// These things are happening concurrently:
|
||||
//
|
||||
// - cancellation request
|
||||
// - keep receiving WAL on the current connection
|
||||
// - if the shared state says we need to change connection, disconnect and return
|
||||
// - this runs in a separate task and we receive updates via a watch channel
|
||||
@@ -108,11 +101,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// - receive updates from broker
|
||||
// - this might change the current desired connection
|
||||
// - timeline state changes to something that does not allow walreceiver to run concurrently
|
||||
|
||||
// NB: make sure each of the select expressions are cancellation-safe
|
||||
// (no need for arms to be cancellation-safe).
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); }
|
||||
select! {
|
||||
Some(wal_connection_update) = async {
|
||||
match connection_manager_state.wal_connection.as_mut() {
|
||||
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
|
||||
@@ -144,7 +133,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
},
|
||||
|
||||
// Got a new update from the broker
|
||||
broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
|
||||
broker_update = broker_subscription.message() => {
|
||||
match broker_update {
|
||||
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
|
||||
Err(status) => {
|
||||
@@ -158,17 +147,16 @@ pub(super) async fn connection_manager_loop_step(
|
||||
warn!("broker subscription failed: {status}");
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
Ok(None) => {
|
||||
error!("broker subscription stream ended"); // can't happen
|
||||
return Ok(());
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
new_event = async {
|
||||
// Reminder: this match arm needs to be cancellation-safe.
|
||||
loop {
|
||||
if connection_manager_state.timeline.current_state() == TimelineState::Loading {
|
||||
warn!("wal connection manager should only be launched after timeline has become active");
|
||||
@@ -194,11 +182,11 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
} => match new_event {
|
||||
ControlFlow::Continue(()) => {
|
||||
return Ok(());
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
ControlFlow::Break(()) => {
|
||||
debug!("Timeline is no longer active, stopping wal connection manager loop");
|
||||
return Err(Cancelled);
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
},
|
||||
|
||||
@@ -230,15 +218,16 @@ pub(super) async fn connection_manager_loop_step(
|
||||
async fn subscribe_for_timeline_updates(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
id: TenantTimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
|
||||
) -> Streaming<SafekeeperTimelineInfo> {
|
||||
let mut attempt = 0;
|
||||
let cancel = shutdown_token();
|
||||
|
||||
loop {
|
||||
exponential_backoff(
|
||||
attempt,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
cancel,
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
attempt += 1;
|
||||
@@ -252,14 +241,9 @@ async fn subscribe_for_timeline_updates(
|
||||
subscription_key: Some(key),
|
||||
};
|
||||
|
||||
match {
|
||||
tokio::select! {
|
||||
r = broker_client.subscribe_safekeeper_info(request) => { r }
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); }
|
||||
}
|
||||
} {
|
||||
match broker_client.subscribe_safekeeper_info(request).await {
|
||||
Ok(resp) => {
|
||||
return Ok(resp.into_inner());
|
||||
return resp.into_inner();
|
||||
}
|
||||
Err(e) => {
|
||||
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
|
||||
@@ -502,10 +486,6 @@ impl ConnectionManagerState {
|
||||
|
||||
/// Drops the current connection (if any) and updates retry timeout for the next
|
||||
/// connection attempt to the same safekeeper.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Not cancellation-safe.
|
||||
async fn drop_old_connection(&mut self, needs_shutdown: bool) {
|
||||
let wal_connection = match self.wal_connection.take() {
|
||||
Some(wal_connection) => wal_connection,
|
||||
@@ -513,14 +493,7 @@ impl ConnectionManagerState {
|
||||
};
|
||||
|
||||
if needs_shutdown {
|
||||
wal_connection
|
||||
.connection_task
|
||||
.shutdown()
|
||||
// This here is why this function isn't cancellation-safe.
|
||||
// If we got cancelled here, then self.wal_connection is already None and we lose track of the task.
|
||||
// Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None
|
||||
// and thus be ineffective.
|
||||
.await;
|
||||
wal_connection.connection_task.shutdown().await;
|
||||
}
|
||||
|
||||
let retry = self
|
||||
@@ -865,9 +838,6 @@ impl ConnectionManagerState {
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Not cancellation-safe.
|
||||
pub(super) async fn shutdown(mut self) {
|
||||
if let Some(wal_connection) = self.wal_connection.take() {
|
||||
wal_connection.connection_task.shutdown().await;
|
||||
|
||||
@@ -389,6 +389,17 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// This is a hack. It piggybacks on the keepalive messages sent by the
|
||||
// safekeeper in order to enforce `checkpoint_timeout` on the currently
|
||||
// open layer. This hack doesn't provide a bound on the total size of
|
||||
// in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916.
|
||||
let mut writer = timeline.writer().await;
|
||||
if let Err(err) = writer.tick().await {
|
||||
warn!("Timeline writer tick failed: {err}");
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
let timeline_remote_consistent_lsn = timeline
|
||||
.get_remote_consistent_lsn_visible()
|
||||
|
||||
@@ -121,16 +121,11 @@ pub(super) enum SetDeletedFlagProgress {
|
||||
Successful(NaiveDateTime),
|
||||
}
|
||||
|
||||
pub(super) struct UploadQueueStoppedDeletable {
|
||||
pub(super) struct UploadQueueStopped {
|
||||
pub(super) upload_queue_for_deletion: UploadQueueInitialized,
|
||||
pub(super) deleted_at: SetDeletedFlagProgress,
|
||||
}
|
||||
|
||||
pub(super) enum UploadQueueStopped {
|
||||
Deletable(UploadQueueStoppedDeletable),
|
||||
Uninitialized,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum NotInitialized {
|
||||
#[error("queue is in state Uninitialized")]
|
||||
@@ -254,15 +249,12 @@ impl UploadQueue {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> {
|
||||
pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
|
||||
match self {
|
||||
UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
|
||||
anyhow::bail!("queue is in state {}", self.as_str())
|
||||
}
|
||||
UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => {
|
||||
anyhow::bail!("queue is in state Stopped(Uninitialized)")
|
||||
}
|
||||
UploadQueue::Stopped(UploadQueueStopped::Deletable(deletable)) => Ok(deletable),
|
||||
UploadQueue::Stopped(stopped) => Ok(stopped),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -782,7 +782,7 @@ where
|
||||
}
|
||||
}
|
||||
// NB: don't use `buf.is_empty()` here; it is from the
|
||||
// `impl Deref for Slice { Target = [u8] }`; the &[u8]
|
||||
// `impl Deref for Slice { Target = [u8] }`; the the &[u8]
|
||||
// returned by it only covers the initialized portion of `buf`.
|
||||
// Whereas we're interested in ensuring that we filled the entire
|
||||
// buffer that the user passed in.
|
||||
|
||||
@@ -312,7 +312,7 @@ pg_cluster_size(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int64 size;
|
||||
|
||||
size = GetNeonCurrentClusterSize();
|
||||
size = GetZenithCurrentClusterSize();
|
||||
|
||||
if (size == 0)
|
||||
PG_RETURN_NULL();
|
||||
|
||||
@@ -26,8 +26,6 @@ extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
|
||||
extern uint64 BackpressureThrottlingTime(void);
|
||||
extern void SetNeonCurrentClusterSize(uint64 size);
|
||||
extern uint64 GetNeonCurrentClusterSize(void);
|
||||
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
|
||||
|
||||
extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
|
||||
|
||||
@@ -1831,7 +1831,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
|
||||
!IsAutoVacuumWorkerProcess())
|
||||
{
|
||||
uint64 current_size = GetNeonCurrentClusterSize();
|
||||
uint64 current_size = GetZenithCurrentClusterSize();
|
||||
|
||||
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
|
||||
ereport(ERROR,
|
||||
@@ -1912,7 +1912,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
|
||||
!IsAutoVacuumWorkerProcess())
|
||||
{
|
||||
uint64 current_size = GetNeonCurrentClusterSize();
|
||||
uint64 current_size = GetZenithCurrentClusterSize();
|
||||
|
||||
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
|
||||
ereport(ERROR,
|
||||
|
||||
@@ -287,7 +287,6 @@ typedef struct WalproposerShmemState
|
||||
slock_t mutex;
|
||||
term_t mineLastElectedTerm;
|
||||
pg_atomic_uint64 backpressureThrottlingTime;
|
||||
pg_atomic_uint64 currentClusterSize;
|
||||
|
||||
/* last feedback from each shard */
|
||||
PageserverFeedback shard_ps_feedback[MAX_SHARDS];
|
||||
|
||||
@@ -282,7 +282,6 @@ WalproposerShmemInit(void)
|
||||
memset(walprop_shared, 0, WalproposerShmemSize());
|
||||
SpinLockInit(&walprop_shared->mutex);
|
||||
pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
|
||||
pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
|
||||
}
|
||||
LWLockRelease(AddinShmemInitLock);
|
||||
|
||||
@@ -1973,7 +1972,7 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
|
||||
|
||||
/* Only one main shard sends non-zero currentClusterSize */
|
||||
if (sk->appendResponse.ps_feedback.currentClusterSize > 0)
|
||||
SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize);
|
||||
SetZenithCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize);
|
||||
|
||||
if (min_feedback.disk_consistent_lsn != standby_apply_lsn)
|
||||
{
|
||||
@@ -2095,18 +2094,6 @@ GetLogRepRestartLSN(WalProposer *wp)
|
||||
return lrRestartLsn;
|
||||
}
|
||||
|
||||
void SetNeonCurrentClusterSize(uint64 size)
|
||||
{
|
||||
pg_atomic_write_u64(&walprop_shared->currentClusterSize, size);
|
||||
}
|
||||
|
||||
uint64 GetNeonCurrentClusterSize(void)
|
||||
{
|
||||
return pg_atomic_read_u64(&walprop_shared->currentClusterSize);
|
||||
}
|
||||
uint64 GetNeonCurrentClusterSize(void);
|
||||
|
||||
|
||||
static const walproposer_api walprop_pg = {
|
||||
.get_shmem_state = walprop_pg_get_shmem_state,
|
||||
.start_streaming = walprop_pg_start_streaming,
|
||||
|
||||
@@ -11,10 +11,6 @@ testing = []
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-iam.workspace = true
|
||||
aws-sigv4.workspace = true
|
||||
aws-types.workspace = true
|
||||
base64.workspace = true
|
||||
bstr.workspace = true
|
||||
bytes = { workspace = true, features = ["serde"] }
|
||||
@@ -31,7 +27,6 @@ hashlink.workspace = true
|
||||
hex.workspace = true
|
||||
hmac.workspace = true
|
||||
hostname.workspace = true
|
||||
http.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper-tungstenite.workspace = true
|
||||
hyper.workspace = true
|
||||
@@ -97,7 +92,6 @@ workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
fallible-iterator.workspace = true
|
||||
rcgen.workspace = true
|
||||
rstest.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
|
||||
@@ -12,8 +12,6 @@ use crate::console::errors::GetAuthInfoError;
|
||||
use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
|
||||
use crate::console::{AuthSecret, NodeInfo};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED};
|
||||
use crate::proxy::connect_compute::ComputeConnectBackend;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::stream::Stream;
|
||||
@@ -30,7 +28,7 @@ use crate::{
|
||||
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
use tracing::info;
|
||||
|
||||
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
|
||||
pub enum MaybeOwned<'a, T> {
|
||||
@@ -176,52 +174,6 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
|
||||
}
|
||||
}
|
||||
|
||||
impl AuthenticationConfig {
|
||||
pub fn check_rate_limit(
|
||||
&self,
|
||||
|
||||
ctx: &mut RequestMonitoring,
|
||||
secret: AuthSecret,
|
||||
endpoint: &EndpointId,
|
||||
is_cleartext: bool,
|
||||
) -> auth::Result<AuthSecret> {
|
||||
// we have validated the endpoint exists, so let's intern it.
|
||||
let endpoint_int = EndpointIdInt::from(endpoint);
|
||||
|
||||
// only count the full hash count if password hack or websocket flow.
|
||||
// in other words, if proxy needs to run the hashing
|
||||
let password_weight = if is_cleartext {
|
||||
match &secret {
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
AuthSecret::Md5(_) => 1,
|
||||
AuthSecret::Scram(s) => s.iterations + 1,
|
||||
}
|
||||
} else {
|
||||
// validating scram takes just 1 hmac_sha_256 operation.
|
||||
1
|
||||
};
|
||||
|
||||
let limit_not_exceeded = self
|
||||
.rate_limiter
|
||||
.check((endpoint_int, ctx.peer_addr), password_weight);
|
||||
|
||||
if !limit_not_exceeded {
|
||||
warn!(
|
||||
enabled = self.rate_limiter_enabled,
|
||||
"rate limiting authentication"
|
||||
);
|
||||
AUTH_RATE_LIMIT_HITS.inc();
|
||||
ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint);
|
||||
|
||||
if self.rate_limiter_enabled {
|
||||
return Err(auth::AuthError::too_many_connections());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(secret)
|
||||
}
|
||||
}
|
||||
|
||||
/// True to its name, this function encapsulates our current auth trade-offs.
|
||||
/// Here, we choose the appropriate auth flow based on circumstances.
|
||||
///
|
||||
@@ -262,24 +214,14 @@ async fn auth_quirks(
|
||||
Some(secret) => secret,
|
||||
None => api.get_role_secret(ctx, &info).await?,
|
||||
};
|
||||
let (cached_entry, secret) = cached_secret.take_value();
|
||||
|
||||
let secret = match secret {
|
||||
Some(secret) => config.check_rate_limit(
|
||||
ctx,
|
||||
secret,
|
||||
&info.endpoint,
|
||||
unauthenticated_password.is_some() || allow_cleartext,
|
||||
)?,
|
||||
None => {
|
||||
// If we don't have an authentication secret, we mock one to
|
||||
// prevent malicious probing (possible due to missing protocol steps).
|
||||
// This mocked secret will never lead to successful authentication.
|
||||
info!("authentication info not found, mocking it");
|
||||
AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
|
||||
}
|
||||
};
|
||||
|
||||
let secret = cached_secret.value.clone().unwrap_or_else(|| {
|
||||
// If we don't have an authentication secret, we mock one to
|
||||
// prevent malicious probing (possible due to missing protocol steps).
|
||||
// This mocked secret will never lead to successful authentication.
|
||||
info!("authentication info not found, mocking it");
|
||||
AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random()))
|
||||
});
|
||||
match authenticate_with_secret(
|
||||
ctx,
|
||||
secret,
|
||||
@@ -295,7 +237,7 @@ async fn auth_quirks(
|
||||
Err(e) => {
|
||||
if e.is_auth_failed() {
|
||||
// The password could have been changed, so we invalidate the cache.
|
||||
cached_entry.invalidate();
|
||||
cached_secret.invalidate();
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
@@ -466,232 +408,3 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytes::BytesMut;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres_protocol::{
|
||||
authentication::sasl::{ChannelBinding, ScramSha256},
|
||||
message::{backend::Message as PgMessage, frontend},
|
||||
};
|
||||
use provider::AuthSecret;
|
||||
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
|
||||
|
||||
use crate::{
|
||||
auth::{ComputeUserInfoMaybeEndpoint, IpPattern},
|
||||
config::AuthenticationConfig,
|
||||
console::{
|
||||
self,
|
||||
provider::{self, CachedAllowedIps, CachedRoleSecret},
|
||||
CachedNodeInfo,
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
proxy::NeonOptions,
|
||||
rate_limiter::{AuthRateLimiter, RateBucketInfo},
|
||||
scram::ServerSecret,
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
|
||||
use super::auth_quirks;
|
||||
|
||||
struct Auth {
|
||||
ips: Vec<IpPattern>,
|
||||
secret: AuthSecret,
|
||||
}
|
||||
|
||||
impl console::Api for Auth {
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
|
||||
Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
|
||||
}
|
||||
|
||||
async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
|
||||
{
|
||||
Ok((
|
||||
CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())),
|
||||
Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))),
|
||||
))
|
||||
}
|
||||
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_user_info: &super::ComputeUserInfo,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
|
||||
scram_protocol_timeout: std::time::Duration::from_secs(5),
|
||||
rate_limiter_enabled: true,
|
||||
rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
|
||||
});
|
||||
|
||||
async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
|
||||
loop {
|
||||
r.read_buf(&mut *b).await.unwrap();
|
||||
if let Some(m) = PgMessage::parse(&mut *b).unwrap() {
|
||||
break m;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn auth_quirks_scram() {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
};
|
||||
|
||||
let user_info = ComputeUserInfoMaybeEndpoint {
|
||||
user: "conrad".into(),
|
||||
endpoint_id: Some("endpoint".into()),
|
||||
options: NeonOptions::default(),
|
||||
};
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported());
|
||||
|
||||
let mut read = BytesMut::new();
|
||||
|
||||
// server should offer scram
|
||||
match read_message(&mut client, &mut read).await {
|
||||
PgMessage::AuthenticationSasl(a) => {
|
||||
let options: Vec<&str> = a.mechanisms().collect().unwrap();
|
||||
assert_eq!(options, ["SCRAM-SHA-256"]);
|
||||
}
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
|
||||
// client sends client-first-message
|
||||
let mut write = BytesMut::new();
|
||||
frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap();
|
||||
client.write_all(&write).await.unwrap();
|
||||
|
||||
// server response with server-first-message
|
||||
match read_message(&mut client, &mut read).await {
|
||||
PgMessage::AuthenticationSaslContinue(a) => {
|
||||
scram.update(a.data()).await.unwrap();
|
||||
}
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
|
||||
// client response with client-final-message
|
||||
write.clear();
|
||||
frontend::sasl_response(scram.message(), &mut write).unwrap();
|
||||
client.write_all(&write).await.unwrap();
|
||||
|
||||
// server response with server-final-message
|
||||
match read_message(&mut client, &mut read).await {
|
||||
PgMessage::AuthenticationSaslFinal(a) => {
|
||||
scram.finish(a.data()).unwrap();
|
||||
}
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
});
|
||||
|
||||
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
handle.await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn auth_quirks_cleartext() {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
};
|
||||
|
||||
let user_info = ComputeUserInfoMaybeEndpoint {
|
||||
user: "conrad".into(),
|
||||
endpoint_id: Some("endpoint".into()),
|
||||
options: NeonOptions::default(),
|
||||
};
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let mut read = BytesMut::new();
|
||||
let mut write = BytesMut::new();
|
||||
|
||||
// server should offer cleartext
|
||||
match read_message(&mut client, &mut read).await {
|
||||
PgMessage::AuthenticationCleartextPassword => {}
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
|
||||
// client responds with password
|
||||
write.clear();
|
||||
frontend::password_message(b"my-secret-password", &mut write).unwrap();
|
||||
client.write_all(&write).await.unwrap();
|
||||
});
|
||||
|
||||
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
handle.await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn auth_quirks_password_hack() {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
let mut stream = PqStream::new(Stream::from_raw(server));
|
||||
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let api = Auth {
|
||||
ips: vec![],
|
||||
secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
|
||||
};
|
||||
|
||||
let user_info = ComputeUserInfoMaybeEndpoint {
|
||||
user: "conrad".into(),
|
||||
endpoint_id: None,
|
||||
options: NeonOptions::default(),
|
||||
};
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let mut read = BytesMut::new();
|
||||
|
||||
// server should offer cleartext
|
||||
match read_message(&mut client, &mut read).await {
|
||||
PgMessage::AuthenticationCleartextPassword => {}
|
||||
_ => panic!("wrong message"),
|
||||
}
|
||||
|
||||
// client responds with password
|
||||
let mut write = BytesMut::new();
|
||||
frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write)
|
||||
.unwrap();
|
||||
client.write_all(&write).await.unwrap();
|
||||
});
|
||||
|
||||
let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(creds.info.endpoint, "my-endpoint");
|
||||
|
||||
handle.await.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,3 @@
|
||||
use aws_config::environment::EnvironmentVariableCredentialsProvider;
|
||||
use aws_config::imds::credentials::ImdsCredentialsProvider;
|
||||
use aws_config::meta::credentials::CredentialsProviderChain;
|
||||
use aws_config::meta::region::RegionProviderChain;
|
||||
use aws_config::profile::ProfileFileCredentialsProvider;
|
||||
use aws_config::provider_config::ProviderConfig;
|
||||
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
|
||||
use futures::future::Either;
|
||||
use proxy::auth;
|
||||
use proxy::auth::backend::MaybeOwned;
|
||||
@@ -17,15 +10,11 @@ use proxy::config::ProjectInfoCacheOptions;
|
||||
use proxy::console;
|
||||
use proxy::context::parquet::ParquetUploadArgs;
|
||||
use proxy::http;
|
||||
use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT;
|
||||
use proxy::rate_limiter::AuthRateLimiter;
|
||||
use proxy::rate_limiter::EndpointRateLimiter;
|
||||
use proxy::rate_limiter::RateBucketInfo;
|
||||
use proxy::rate_limiter::RateLimiterConfig;
|
||||
use proxy::redis::cancellation_publisher::RedisPublisherClient;
|
||||
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use proxy::redis::elasticache;
|
||||
use proxy::redis::notifications;
|
||||
use proxy::redis::publisher::RedisPublisherClient;
|
||||
use proxy::serverless::GlobalConnPoolOptions;
|
||||
use proxy::usage_metrics;
|
||||
|
||||
@@ -142,16 +131,10 @@ struct ProxyCliArgs {
|
||||
///
|
||||
/// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
|
||||
/// Can be given multiple times for different bucket sizes.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
|
||||
endpoint_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Whether the auth rate limiter actually takes effect (for testing)
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
auth_rate_limit_enabled: bool,
|
||||
/// Authentication rate limiter max number of hashes per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
|
||||
auth_rate_limit: Vec<RateBucketInfo>,
|
||||
/// Redis rate limiter max number of requests per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
|
||||
redis_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
|
||||
#[clap(long, default_value_t = 100)]
|
||||
@@ -167,24 +150,9 @@ struct ProxyCliArgs {
|
||||
/// disable ip check for http requests. If it is too time consuming, it could be turned off.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
disable_ip_check_for_http: bool,
|
||||
/// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
|
||||
/// redis url for notifications.
|
||||
#[clap(long)]
|
||||
redis_notifications: Option<String>,
|
||||
/// redis host for streaming connections (might be different from the notifications host)
|
||||
#[clap(long)]
|
||||
redis_host: Option<String>,
|
||||
/// redis port for streaming connections (might be different from the notifications host)
|
||||
#[clap(long)]
|
||||
redis_port: Option<u16>,
|
||||
/// redis cluster name, used in aws elasticache
|
||||
#[clap(long)]
|
||||
redis_cluster_name: Option<String>,
|
||||
/// redis user_id, used in aws elasticache
|
||||
#[clap(long)]
|
||||
redis_user_id: Option<String>,
|
||||
/// aws region to retrieve credentials
|
||||
#[clap(long, default_value_t = String::new())]
|
||||
aws_region: String,
|
||||
/// cache for `project_info` (use `size=0` to disable)
|
||||
#[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
project_info_cache: String,
|
||||
@@ -248,61 +216,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
let config = build_config(&args)?;
|
||||
|
||||
info!("Authentication backend: {}", config.auth_backend);
|
||||
info!("Using region: {}", config.aws_region);
|
||||
|
||||
let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
|
||||
let provider_conf =
|
||||
ProviderConfig::without_region().with_region(region_provider.region().await);
|
||||
let aws_credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
|
||||
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
|
||||
.or_else(
|
||||
"profile-sso",
|
||||
ProfileFileCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// needed to access remote extensions bucket
|
||||
.or_else(
|
||||
"token",
|
||||
WebIdentityTokenCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
// uses imds v2
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
|
||||
elasticache::AWSIRSAConfig::new(
|
||||
config.aws_region.clone(),
|
||||
args.redis_cluster_name,
|
||||
args.redis_user_id,
|
||||
),
|
||||
aws_credentials_provider,
|
||||
));
|
||||
let redis_notifications_client =
|
||||
match (args.redis_notifications, (args.redis_host, args.redis_port)) {
|
||||
(Some(url), _) => {
|
||||
info!("Starting redis notifications listener ({url})");
|
||||
Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
|
||||
}
|
||||
(None, (Some(host), Some(port))) => Some(
|
||||
ConnectionWithCredentialsProvider::new_with_credentials_provider(
|
||||
host,
|
||||
port,
|
||||
elasticache_credentials_provider.clone(),
|
||||
),
|
||||
),
|
||||
(None, (None, None)) => {
|
||||
warn!("Redis is disabled");
|
||||
None
|
||||
}
|
||||
_ => {
|
||||
bail!("redis-host and redis-port must be specified together");
|
||||
}
|
||||
};
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
let http_address: SocketAddr = args.http.parse()?;
|
||||
@@ -320,22 +233,17 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
|
||||
let cancel_map = CancelMap::default();
|
||||
|
||||
// let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x)));
|
||||
let redis_publisher = match &redis_notifications_client {
|
||||
Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
|
||||
redis_publisher.clone(),
|
||||
let redis_publisher = match &args.redis_notifications {
|
||||
Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
|
||||
url,
|
||||
args.region.clone(),
|
||||
&config.redis_rps_limit,
|
||||
)?))),
|
||||
None => None,
|
||||
};
|
||||
let cancellation_handler = Arc::new(CancellationHandler::<
|
||||
Option<Arc<tokio::sync::Mutex<RedisPublisherClient>>>,
|
||||
>::new(
|
||||
let cancellation_handler = Arc::new(CancellationHandler::new(
|
||||
cancel_map.clone(),
|
||||
redis_publisher,
|
||||
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT,
|
||||
));
|
||||
|
||||
// client facing tasks. these will exit on error or on cancellation
|
||||
@@ -382,16 +290,17 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
if let auth::BackendType::Console(api, _) = &config.auth_backend {
|
||||
if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
|
||||
if let Some(redis_notifications_client) = redis_notifications_client {
|
||||
let cache = api.caches.project_info.clone();
|
||||
let cache = api.caches.project_info.clone();
|
||||
if let Some(url) = args.redis_notifications {
|
||||
info!("Starting redis notifications listener ({url})");
|
||||
maintenance_tasks.spawn(notifications::task_main(
|
||||
redis_notifications_client.clone(),
|
||||
url.to_owned(),
|
||||
cache.clone(),
|
||||
cancel_map.clone(),
|
||||
args.region.clone(),
|
||||
));
|
||||
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
|
||||
}
|
||||
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -517,8 +426,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
};
|
||||
let authentication_config = AuthenticationConfig {
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
rate_limiter_enabled: args.auth_rate_limit_enabled,
|
||||
rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
|
||||
};
|
||||
|
||||
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
|
||||
@@ -538,8 +445,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
endpoint_rps_limit,
|
||||
redis_rps_limit,
|
||||
handshake_timeout: args.handshake_timeout,
|
||||
// TODO: add this argument
|
||||
region: args.region.clone(),
|
||||
aws_region: args.aws_region.clone(),
|
||||
}));
|
||||
|
||||
Ok(config)
|
||||
|
||||
10
proxy/src/cache/common.rs
vendored
10
proxy/src/cache/common.rs
vendored
@@ -43,16 +43,6 @@ impl<C: Cache, V> Cached<C, V> {
|
||||
Self { token: None, value }
|
||||
}
|
||||
|
||||
pub fn take_value(self) -> (Cached<C, ()>, V) {
|
||||
(
|
||||
Cached {
|
||||
token: self.token,
|
||||
value: (),
|
||||
},
|
||||
self.value,
|
||||
)
|
||||
}
|
||||
|
||||
/// Drop this entry from a cache if it's still there.
|
||||
pub fn invalidate(self) -> V {
|
||||
if let Some((cache, info)) = &self.token {
|
||||
|
||||
30
proxy/src/cache/project_info.rs
vendored
30
proxy/src/cache/project_info.rs
vendored
@@ -373,7 +373,10 @@ mod tests {
|
||||
let endpoint_id = "endpoint".into();
|
||||
let user1: RoleName = "user1".into();
|
||||
let user2: RoleName = "user2".into();
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user1.as_str(),
|
||||
[1; 32],
|
||||
)));
|
||||
let secret2 = None;
|
||||
let allowed_ips = Arc::new(vec![
|
||||
"127.0.0.1".parse().unwrap(),
|
||||
@@ -392,7 +395,10 @@ mod tests {
|
||||
|
||||
// Shouldn't add more than 2 roles.
|
||||
let user3: RoleName = "user3".into();
|
||||
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
|
||||
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user3.as_str(),
|
||||
[3; 32],
|
||||
)));
|
||||
cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
|
||||
assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
|
||||
|
||||
@@ -425,8 +431,14 @@ mod tests {
|
||||
let endpoint_id = "endpoint".into();
|
||||
let user1: RoleName = "user1".into();
|
||||
let user2: RoleName = "user2".into();
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
||||
let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32])));
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user1.as_str(),
|
||||
[1; 32],
|
||||
)));
|
||||
let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user2.as_str(),
|
||||
[2; 32],
|
||||
)));
|
||||
let allowed_ips = Arc::new(vec![
|
||||
"127.0.0.1".parse().unwrap(),
|
||||
"127.0.0.2".parse().unwrap(),
|
||||
@@ -474,8 +486,14 @@ mod tests {
|
||||
let endpoint_id = "endpoint".into();
|
||||
let user1: RoleName = "user1".into();
|
||||
let user2: RoleName = "user2".into();
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
||||
let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32])));
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user1.as_str(),
|
||||
[1; 32],
|
||||
)));
|
||||
let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user2.as_str(),
|
||||
[2; 32],
|
||||
)));
|
||||
let allowed_ips = Arc::new(vec![
|
||||
"127.0.0.1".parse().unwrap(),
|
||||
"127.0.0.2".parse().unwrap(),
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use pq_proto::CancelKeyData;
|
||||
use std::{net::SocketAddr, sync::Arc};
|
||||
@@ -9,26 +10,18 @@ use tracing::info;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
error::ReportableError,
|
||||
metrics::NUM_CANCELLATION_REQUESTS,
|
||||
redis::cancellation_publisher::{
|
||||
CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
|
||||
},
|
||||
error::ReportableError, metrics::NUM_CANCELLATION_REQUESTS,
|
||||
redis::publisher::RedisPublisherClient,
|
||||
};
|
||||
|
||||
pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
|
||||
pub type CancellationHandlerMain = CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>>;
|
||||
pub type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>;
|
||||
|
||||
/// Enables serving `CancelRequest`s.
|
||||
///
|
||||
/// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances.
|
||||
pub struct CancellationHandler<P> {
|
||||
/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances.
|
||||
pub struct CancellationHandler {
|
||||
map: CancelMap,
|
||||
client: P,
|
||||
/// This field used for the monitoring purposes.
|
||||
/// Represents the source of the cancellation request.
|
||||
from: &'static str,
|
||||
redis_client: Option<Arc<Mutex<RedisPublisherClient>>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
@@ -51,9 +44,49 @@ impl ReportableError for CancelError {
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
impl CancellationHandler {
|
||||
pub fn new(map: CancelMap, redis_client: Option<Arc<Mutex<RedisPublisherClient>>>) -> Self {
|
||||
Self { map, redis_client }
|
||||
}
|
||||
/// Cancel a running query for the corresponding connection.
|
||||
pub async fn cancel_session(
|
||||
&self,
|
||||
key: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> Result<(), CancelError> {
|
||||
let from = "from_client";
|
||||
// NB: we should immediately release the lock after cloning the token.
|
||||
let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
|
||||
tracing::warn!("query cancellation key not found: {key}");
|
||||
if let Some(redis_client) = &self.redis_client {
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[from, "not_found"])
|
||||
.inc();
|
||||
info!("publishing cancellation key to Redis");
|
||||
match redis_client.lock().await.try_publish(key, session_id).await {
|
||||
Ok(()) => {
|
||||
info!("cancellation key successfuly published to Redis");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("failed to publish a message: {e}");
|
||||
return Err(CancelError::IO(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
e.to_string(),
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
};
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[from, "found"])
|
||||
.inc();
|
||||
info!("cancelling query per user's request using key {key}");
|
||||
cancel_closure.try_cancel_query().await
|
||||
}
|
||||
|
||||
/// Run async action within an ephemeral session identified by [`CancelKeyData`].
|
||||
pub fn get_session(self: Arc<Self>) -> Session<P> {
|
||||
pub fn get_session(self: Arc<Self>) -> Session {
|
||||
// HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
|
||||
// expose it and we don't want to do another roundtrip to query
|
||||
// for it. The client will be able to notice that this is not the
|
||||
@@ -79,39 +112,9 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
cancellation_handler: self,
|
||||
}
|
||||
}
|
||||
/// Try to cancel a running query for the corresponding connection.
|
||||
/// If the cancellation key is not found, it will be published to Redis.
|
||||
pub async fn cancel_session(
|
||||
&self,
|
||||
key: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> Result<(), CancelError> {
|
||||
// NB: we should immediately release the lock after cloning the token.
|
||||
let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
|
||||
tracing::warn!("query cancellation key not found: {key}");
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[self.from, "not_found"])
|
||||
.inc();
|
||||
match self.client.try_publish(key, session_id).await {
|
||||
Ok(()) => {} // do nothing
|
||||
Err(e) => {
|
||||
return Err(CancelError::IO(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
e.to_string(),
|
||||
)));
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
};
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[self.from, "found"])
|
||||
.inc();
|
||||
info!("cancelling query per user's request using key {key}");
|
||||
cancel_closure.try_cancel_query().await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn contains(&self, session: &Session<P>) -> bool {
|
||||
fn contains(&self, session: &Session) -> bool {
|
||||
self.map.contains_key(&session.key)
|
||||
}
|
||||
|
||||
@@ -121,19 +124,31 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
}
|
||||
}
|
||||
|
||||
impl CancellationHandler<()> {
|
||||
pub fn new(map: CancelMap, from: &'static str) -> Self {
|
||||
Self {
|
||||
map,
|
||||
client: (),
|
||||
from,
|
||||
}
|
||||
}
|
||||
#[async_trait]
|
||||
pub trait NotificationsCancellationHandler {
|
||||
async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>;
|
||||
}
|
||||
|
||||
impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
|
||||
pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: &'static str) -> Self {
|
||||
Self { map, client, from }
|
||||
#[async_trait]
|
||||
impl NotificationsCancellationHandler for CancellationHandler {
|
||||
async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> {
|
||||
let from = "from_redis";
|
||||
let cancel_closure = self.map.get(&key).and_then(|x| x.clone());
|
||||
match cancel_closure {
|
||||
Some(cancel_closure) => {
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[from, "found"])
|
||||
.inc();
|
||||
cancel_closure.try_cancel_query().await
|
||||
}
|
||||
None => {
|
||||
NUM_CANCELLATION_REQUESTS
|
||||
.with_label_values(&[from, "not_found"])
|
||||
.inc();
|
||||
tracing::warn!("query cancellation key not found: {key}");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -163,14 +178,14 @@ impl CancelClosure {
|
||||
}
|
||||
|
||||
/// Helper for registering query cancellation tokens.
|
||||
pub struct Session<P> {
|
||||
pub struct Session {
|
||||
/// The user-facing key identifying this session.
|
||||
key: CancelKeyData,
|
||||
/// The [`CancelMap`] this session belongs to.
|
||||
cancellation_handler: Arc<CancellationHandler<P>>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
}
|
||||
|
||||
impl<P> Session<P> {
|
||||
impl Session {
|
||||
/// Store the cancel token for the given session.
|
||||
/// This enables query cancellation in `crate::proxy::prepare_client_connection`.
|
||||
pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
@@ -183,7 +198,7 @@ impl<P> Session<P> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<P> Drop for Session<P> {
|
||||
impl Drop for Session {
|
||||
fn drop(&mut self) {
|
||||
self.cancellation_handler.map.remove(&self.key);
|
||||
info!("dropped query cancellation key {}", &self.key);
|
||||
@@ -192,16 +207,14 @@ impl<P> Drop for Session<P> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_session_drop() -> anyhow::Result<()> {
|
||||
let cancellation_handler = Arc::new(CancellationHandler::<()>::new(
|
||||
CancelMap::default(),
|
||||
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
|
||||
));
|
||||
let cancellation_handler = Arc::new(CancellationHandler {
|
||||
map: CancelMap::default(),
|
||||
redis_client: None,
|
||||
});
|
||||
|
||||
let session = cancellation_handler.clone().get_session();
|
||||
assert!(cancellation_handler.contains(&session));
|
||||
@@ -211,19 +224,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn cancel_session_noop_regression() {
|
||||
let handler = CancellationHandler::<()>::new(Default::default(), "local");
|
||||
handler
|
||||
.cancel_session(
|
||||
CancelKeyData {
|
||||
backend_pid: 0,
|
||||
cancel_key: 0,
|
||||
},
|
||||
Uuid::new_v4(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -82,13 +82,14 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
|
||||
/// A config for establishing a connection to compute node.
|
||||
/// Eventually, `tokio_postgres` will be replaced with something better.
|
||||
/// Newtype allows us to implement methods on top of it.
|
||||
#[derive(Clone, Default)]
|
||||
#[derive(Clone)]
|
||||
#[repr(transparent)]
|
||||
pub struct ConnCfg(Box<tokio_postgres::Config>);
|
||||
|
||||
/// Creation and initialization routines.
|
||||
impl ConnCfg {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
Self(Default::default())
|
||||
}
|
||||
|
||||
/// Reuse password or auth keys from the other config.
|
||||
@@ -164,6 +165,12 @@ impl std::ops::DerefMut for ConnCfg {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ConnCfg {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ConnCfg {
|
||||
/// Establish a raw TCP connection to the compute node.
|
||||
async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
|
||||
|
||||
@@ -1,8 +1,4 @@
|
||||
use crate::{
|
||||
auth,
|
||||
rate_limiter::{AuthRateLimiter, RateBucketInfo},
|
||||
serverless::GlobalConnPoolOptions,
|
||||
};
|
||||
use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions};
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
use itertools::Itertools;
|
||||
use rustls::{
|
||||
@@ -32,7 +28,6 @@ pub struct ProxyConfig {
|
||||
pub redis_rps_limit: Vec<RateBucketInfo>,
|
||||
pub region: String,
|
||||
pub handshake_timeout: Duration,
|
||||
pub aws_region: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -54,8 +49,6 @@ pub struct HttpConfig {
|
||||
|
||||
pub struct AuthenticationConfig {
|
||||
pub scram_protocol_timeout: tokio::time::Duration,
|
||||
pub rate_limiter_enabled: bool,
|
||||
pub rate_limiter: AuthRateLimiter,
|
||||
}
|
||||
|
||||
impl TlsConfig {
|
||||
|
||||
@@ -6,7 +6,7 @@ pub mod messages;
|
||||
|
||||
/// Wrappers for console APIs and their mocks.
|
||||
pub mod provider;
|
||||
pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
|
||||
pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
|
||||
|
||||
/// Various cache-related types.
|
||||
pub mod caches {
|
||||
|
||||
@@ -14,6 +14,7 @@ use crate::{
|
||||
context::RequestMonitoring,
|
||||
scram, EndpointCacheKey, ProjectId,
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
|
||||
@@ -325,7 +326,8 @@ pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPatt
|
||||
|
||||
/// This will allocate per each call, but the http requests alone
|
||||
/// already require a few allocations, so it should be fine.
|
||||
pub(crate) trait Api {
|
||||
#[async_trait]
|
||||
pub trait Api {
|
||||
/// Get the client's auth secret for authentication.
|
||||
/// Returns option because user not found situation is special.
|
||||
/// We still have to mock the scram to avoid leaking information that user doesn't exist.
|
||||
@@ -361,6 +363,7 @@ pub enum ConsoleBackend {
|
||||
Test(Box<dyn crate::auth::backend::TestBackend>),
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Api for ConsoleBackend {
|
||||
async fn get_role_secret(
|
||||
&self,
|
||||
|
||||
@@ -8,6 +8,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
|
||||
use crate::{auth::IpPattern, cache::Cached};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use std::{str::FromStr, sync::Arc};
|
||||
use thiserror::Error;
|
||||
@@ -143,6 +144,7 @@ async fn get_execute_postgres_query(
|
||||
Ok(Some(entry))
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl super::Api for Api {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn get_role_secret(
|
||||
|
||||
@@ -14,6 +14,7 @@ use crate::{
|
||||
context::RequestMonitoring,
|
||||
metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
|
||||
};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use std::sync::Arc;
|
||||
use tokio::time::Instant;
|
||||
@@ -55,7 +56,7 @@ impl Api {
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<AuthInfo, GetAuthInfoError> {
|
||||
let request_id = ctx.session_id.to_string();
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
let application_name = ctx.console_application_name();
|
||||
async {
|
||||
let request = self
|
||||
@@ -112,7 +113,7 @@ impl Api {
|
||||
ctx: &mut RequestMonitoring,
|
||||
user_info: &ComputeUserInfo,
|
||||
) -> Result<NodeInfo, WakeComputeError> {
|
||||
let request_id = ctx.session_id.to_string();
|
||||
let request_id = uuid::Uuid::new_v4().to_string();
|
||||
let application_name = ctx.console_application_name();
|
||||
async {
|
||||
let mut request_builder = self
|
||||
@@ -167,6 +168,7 @@ impl Api {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl super::Api for Api {
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn get_role_secret(
|
||||
|
||||
@@ -4,10 +4,7 @@ use ::metrics::{
|
||||
register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
|
||||
IntCounterVec, IntGauge, IntGaugeVec,
|
||||
};
|
||||
use metrics::{
|
||||
register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter,
|
||||
IntCounterPair,
|
||||
};
|
||||
use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::time::{self, Instant};
|
||||
@@ -164,9 +161,6 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client";
|
||||
pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis";
|
||||
|
||||
pub enum Waiting {
|
||||
Cplane,
|
||||
Client,
|
||||
@@ -361,20 +355,3 @@ pub static TLS_HANDSHAKE_FAILURES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy<HyperLogLog<32>> = Lazy::new(|| {
|
||||
register_hll!(
|
||||
32,
|
||||
"proxy_endpoints_auth_rate_limits",
|
||||
"Number of endpoints affected by authentication rate limits",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static AUTH_RATE_LIMIT_HITS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"proxy_requests_auth_rate_limits_total",
|
||||
"Number of connection requests affected by authentication rate limits",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
@@ -10,7 +10,7 @@ pub mod wake_compute;
|
||||
|
||||
use crate::{
|
||||
auth,
|
||||
cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal},
|
||||
cancellation::{self, CancellationHandler},
|
||||
compute,
|
||||
config::{ProxyConfig, TlsConfig},
|
||||
context::RequestMonitoring,
|
||||
@@ -62,7 +62,7 @@ pub async fn task_main(
|
||||
listener: tokio::net::TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("proxy has shut down");
|
||||
@@ -233,12 +233,12 @@ impl ReportableError for ClientRequestError {
|
||||
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
config: &'static ProxyConfig,
|
||||
ctx: &mut RequestMonitoring,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
stream: S,
|
||||
mode: ClientMode,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
conn_gauge: IntCounterPairGuard,
|
||||
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
|
||||
) -> Result<Option<ProxyPassthrough<S>>, ClientRequestError> {
|
||||
info!("handling interactive connection from client");
|
||||
|
||||
let proto = ctx.protocol;
|
||||
@@ -280,7 +280,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
// check rate limit
|
||||
if let Some(ep) = user_info.get_endpoint() {
|
||||
if !endpoint_rate_limiter.check(ep, 1) {
|
||||
if !endpoint_rate_limiter.check(ep) {
|
||||
return stream
|
||||
.throw_error(auth::AuthError::too_many_connections())
|
||||
.await?;
|
||||
@@ -338,9 +338,9 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
/// Finish client connection initialization: confirm auth success, send params, etc.
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn prepare_client_connection<P>(
|
||||
async fn prepare_client_connection(
|
||||
node: &compute::PostgresConnection,
|
||||
session: &cancellation::Session<P>,
|
||||
session: &cancellation::Session,
|
||||
stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
// Register compute's query cancellation token and produce a new, unique one.
|
||||
|
||||
@@ -55,17 +55,17 @@ pub async fn proxy_pass(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub struct ProxyPassthrough<P, S> {
|
||||
pub struct ProxyPassthrough<S> {
|
||||
pub client: Stream<S>,
|
||||
pub compute: PostgresConnection,
|
||||
pub aux: MetricsAuxInfo,
|
||||
|
||||
pub req: IntCounterPairGuard,
|
||||
pub conn: IntCounterPairGuard,
|
||||
pub cancel: cancellation::Session<P>,
|
||||
pub cancel: cancellation::Session,
|
||||
}
|
||||
|
||||
impl<P, S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<P, S> {
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
|
||||
pub async fn proxy_pass(self) -> anyhow::Result<()> {
|
||||
let res = proxy_pass(self.client, self.compute.stream, self.aux).await;
|
||||
self.compute.cancel_closure.try_cancel_query().await?;
|
||||
|
||||
@@ -142,8 +142,8 @@ impl Scram {
|
||||
Ok(Scram(secret))
|
||||
}
|
||||
|
||||
fn mock() -> Self {
|
||||
Scram(scram::ServerSecret::mock(rand::random()))
|
||||
fn mock(user: &str) -> Self {
|
||||
Scram(scram::ServerSecret::mock(user, rand::random()))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -330,7 +330,11 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
|
||||
|
||||
let (client_config, server_config) =
|
||||
generate_tls_config("generic-project-name.localhost", "localhost")?;
|
||||
let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock()));
|
||||
let proxy = tokio::spawn(dummy_proxy(
|
||||
client,
|
||||
Some(server_config),
|
||||
Scram::mock("user"),
|
||||
));
|
||||
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
let password: String = rand::thread_rng()
|
||||
|
||||
@@ -4,4 +4,4 @@ mod limiter;
|
||||
pub use aimd::Aimd;
|
||||
pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
|
||||
pub use limiter::Limiter;
|
||||
pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter};
|
||||
pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter};
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::hash_map::RandomState,
|
||||
hash::{BuildHasher, Hash},
|
||||
net::IpAddr,
|
||||
hash::BuildHasher,
|
||||
sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc, Mutex,
|
||||
@@ -17,7 +15,7 @@ use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
|
||||
use tokio::time::{timeout, Duration, Instant};
|
||||
use tracing::info;
|
||||
|
||||
use crate::{intern::EndpointIdInt, EndpointId};
|
||||
use crate::EndpointId;
|
||||
|
||||
use super::{
|
||||
limit_algorithm::{LimitAlgorithm, Sample},
|
||||
@@ -51,11 +49,11 @@ impl RedisRateLimiter {
|
||||
.data
|
||||
.iter_mut()
|
||||
.zip(self.info)
|
||||
.all(|(bucket, info)| bucket.should_allow_request(info, now, 1));
|
||||
.all(|(bucket, info)| bucket.should_allow_request(info, now));
|
||||
|
||||
if should_allow_request {
|
||||
// only increment the bucket counts if the request will actually be accepted
|
||||
self.data.iter_mut().for_each(|b| b.inc(1));
|
||||
self.data.iter_mut().for_each(RateBucket::inc);
|
||||
}
|
||||
|
||||
should_allow_request
|
||||
@@ -73,14 +71,9 @@ impl RedisRateLimiter {
|
||||
// saw SNI, before doing TLS handshake. User-side error messages in that case
|
||||
// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
|
||||
// I went with a more expensive way that yields user-friendlier error messages.
|
||||
pub type EndpointRateLimiter = BucketRateLimiter<EndpointId, StdRng, RandomState>;
|
||||
|
||||
// This can't be just per IP because that would limit some PaaS that share IP addresses
|
||||
pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>;
|
||||
|
||||
pub struct BucketRateLimiter<Key, Rand = StdRng, Hasher = RandomState> {
|
||||
map: DashMap<Key, Vec<RateBucket>, Hasher>,
|
||||
info: Cow<'static, [RateBucketInfo]>,
|
||||
pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
|
||||
map: DashMap<EndpointId, Vec<RateBucket>, Hasher>,
|
||||
info: &'static [RateBucketInfo],
|
||||
access_count: AtomicUsize,
|
||||
rand: Mutex<Rand>,
|
||||
}
|
||||
@@ -92,9 +85,9 @@ struct RateBucket {
|
||||
}
|
||||
|
||||
impl RateBucket {
|
||||
fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant, n: u32) -> bool {
|
||||
fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool {
|
||||
if now - self.start < info.interval {
|
||||
self.count + n <= info.max_rpi
|
||||
self.count < info.max_rpi
|
||||
} else {
|
||||
// bucket expired, reset
|
||||
self.count = 0;
|
||||
@@ -104,8 +97,8 @@ impl RateBucket {
|
||||
}
|
||||
}
|
||||
|
||||
fn inc(&mut self, n: u32) {
|
||||
self.count += n;
|
||||
fn inc(&mut self) {
|
||||
self.count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,7 +111,7 @@ pub struct RateBucketInfo {
|
||||
|
||||
impl std::fmt::Display for RateBucketInfo {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64;
|
||||
let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32;
|
||||
write!(f, "{rps}@{}", humantime::format_duration(self.interval))
|
||||
}
|
||||
}
|
||||
@@ -143,25 +136,12 @@ impl std::str::FromStr for RateBucketInfo {
|
||||
}
|
||||
|
||||
impl RateBucketInfo {
|
||||
pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
|
||||
pub const DEFAULT_SET: [Self; 3] = [
|
||||
Self::new(300, Duration::from_secs(1)),
|
||||
Self::new(200, Duration::from_secs(60)),
|
||||
Self::new(100, Duration::from_secs(600)),
|
||||
];
|
||||
|
||||
/// All of these are per endpoint-ip pair.
|
||||
/// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
|
||||
///
|
||||
/// First bucket: 300mcpus total per endpoint-ip pair
|
||||
/// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first)
|
||||
/// * 300 requests per second with 4096 hash rounds.
|
||||
/// * 2 requests per second with 600000 hash rounds.
|
||||
pub const DEFAULT_AUTH_SET: [Self; 3] = [
|
||||
Self::new(300 * 4096, Duration::from_secs(1)),
|
||||
Self::new(200 * 4096, Duration::from_secs(60)),
|
||||
Self::new(100 * 4096, Duration::from_secs(600)),
|
||||
];
|
||||
|
||||
pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
|
||||
info.sort_unstable_by_key(|info| info.interval);
|
||||
let invalid = info
|
||||
@@ -170,7 +150,7 @@ impl RateBucketInfo {
|
||||
.find(|(a, b)| a.max_rpi > b.max_rpi);
|
||||
if let Some((a, b)) = invalid {
|
||||
bail!(
|
||||
"invalid bucket RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
|
||||
"invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
|
||||
b.max_rpi,
|
||||
a.max_rpi,
|
||||
);
|
||||
@@ -182,24 +162,19 @@ impl RateBucketInfo {
|
||||
pub const fn new(max_rps: u32, interval: Duration) -> Self {
|
||||
Self {
|
||||
interval,
|
||||
max_rpi: ((max_rps as u64) * (interval.as_millis() as u64) / 1000) as u32,
|
||||
max_rpi: max_rps * interval.as_millis() as u32 / 1000,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq> BucketRateLimiter<K> {
|
||||
pub fn new(info: impl Into<Cow<'static, [RateBucketInfo]>>) -> Self {
|
||||
impl EndpointRateLimiter {
|
||||
pub fn new(info: &'static [RateBucketInfo]) -> Self {
|
||||
Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
fn new_with_rand_and_hasher(
|
||||
info: impl Into<Cow<'static, [RateBucketInfo]>>,
|
||||
rand: R,
|
||||
hasher: S,
|
||||
) -> Self {
|
||||
let info = info.into();
|
||||
impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
|
||||
fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self {
|
||||
info!(buckets = ?info, "endpoint rate limiter");
|
||||
Self {
|
||||
info,
|
||||
@@ -210,7 +185,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
}
|
||||
|
||||
/// Check that number of connections to the endpoint is below `max_rps` rps.
|
||||
pub fn check(&self, key: K, n: u32) -> bool {
|
||||
pub fn check(&self, endpoint: EndpointId) -> bool {
|
||||
// do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
|
||||
// worst case memory usage is about:
|
||||
// = 2 * 2048 * 64 * (48B + 72B)
|
||||
@@ -220,7 +195,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
let mut entry = self.map.entry(key).or_insert_with(|| {
|
||||
let mut entry = self.map.entry(endpoint).or_insert_with(|| {
|
||||
vec![
|
||||
RateBucket {
|
||||
start: now,
|
||||
@@ -232,12 +207,12 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
|
||||
let should_allow_request = entry
|
||||
.iter_mut()
|
||||
.zip(&*self.info)
|
||||
.all(|(bucket, info)| bucket.should_allow_request(info, now, n));
|
||||
.zip(self.info)
|
||||
.all(|(bucket, info)| bucket.should_allow_request(info, now));
|
||||
|
||||
if should_allow_request {
|
||||
// only increment the bucket counts if the request will actually be accepted
|
||||
entry.iter_mut().for_each(|b| b.inc(n));
|
||||
entry.iter_mut().for_each(RateBucket::inc);
|
||||
}
|
||||
|
||||
should_allow_request
|
||||
@@ -248,7 +223,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
/// But that way deletion does not aquire mutex on each entry access.
|
||||
pub fn do_gc(&self) {
|
||||
info!(
|
||||
"cleaning up bucket rate limiter, current size = {}",
|
||||
"cleaning up endpoint rate limiter, current size = {}",
|
||||
self.map.len()
|
||||
);
|
||||
let n = self.map.shards().len();
|
||||
@@ -559,7 +534,7 @@ mod tests {
|
||||
use rustc_hash::FxHasher;
|
||||
use tokio::time;
|
||||
|
||||
use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome};
|
||||
use super::{EndpointRateLimiter, Limiter, Outcome};
|
||||
use crate::{
|
||||
rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
|
||||
EndpointId,
|
||||
@@ -697,12 +672,12 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn default_rate_buckets() {
|
||||
let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET;
|
||||
let mut defaults = RateBucketInfo::DEFAULT_SET;
|
||||
RateBucketInfo::validate(&mut defaults[..]).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic = "invalid bucket RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
|
||||
#[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
|
||||
fn rate_buckets_validate() {
|
||||
let mut rates: Vec<RateBucketInfo> = ["300@1s", "10@10s"]
|
||||
.into_iter()
|
||||
@@ -718,42 +693,42 @@ mod tests {
|
||||
.map(|s| s.parse().unwrap())
|
||||
.collect();
|
||||
RateBucketInfo::validate(&mut rates).unwrap();
|
||||
let limiter = EndpointRateLimiter::new(rates);
|
||||
let limiter = EndpointRateLimiter::new(Vec::leak(rates));
|
||||
|
||||
let endpoint = EndpointId::from("ep-my-endpoint-1234");
|
||||
|
||||
time::pause();
|
||||
|
||||
for _ in 0..100 {
|
||||
assert!(limiter.check(endpoint.clone(), 1));
|
||||
assert!(limiter.check(endpoint.clone()));
|
||||
}
|
||||
// more connections fail
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint.clone()));
|
||||
|
||||
// fail even after 500ms as it's in the same bucket
|
||||
time::advance(time::Duration::from_millis(500)).await;
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint.clone()));
|
||||
|
||||
// after a full 1s, 100 requests are allowed again
|
||||
time::advance(time::Duration::from_millis(500)).await;
|
||||
for _ in 1..6 {
|
||||
for _ in 0..50 {
|
||||
assert!(limiter.check(endpoint.clone(), 2));
|
||||
for _ in 0..100 {
|
||||
assert!(limiter.check(endpoint.clone()));
|
||||
}
|
||||
time::advance(time::Duration::from_millis(1000)).await;
|
||||
}
|
||||
|
||||
// more connections after 600 will exceed the 20rps@30s limit
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint.clone()));
|
||||
|
||||
// will still fail before the 30 second limit
|
||||
time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await;
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint.clone()));
|
||||
|
||||
// after the full 30 seconds, 100 requests are allowed again
|
||||
time::advance(time::Duration::from_millis(1)).await;
|
||||
for _ in 0..100 {
|
||||
assert!(limiter.check(endpoint.clone(), 1));
|
||||
assert!(limiter.check(endpoint.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -763,41 +738,14 @@ mod tests {
|
||||
let rand = rand::rngs::StdRng::from_seed([1; 32]);
|
||||
let hasher = BuildHasherDefault::<FxHasher>::default();
|
||||
|
||||
let limiter = BucketRateLimiter::new_with_rand_and_hasher(
|
||||
&RateBucketInfo::DEFAULT_ENDPOINT_SET,
|
||||
let limiter = EndpointRateLimiter::new_with_rand_and_hasher(
|
||||
&RateBucketInfo::DEFAULT_SET,
|
||||
rand,
|
||||
hasher,
|
||||
);
|
||||
for i in 0..1_000_000 {
|
||||
limiter.check(i, 1);
|
||||
limiter.check(format!("{i}").into());
|
||||
}
|
||||
assert!(limiter.map.len() < 150_000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_default_auth_set() {
|
||||
// these values used to exceed u32::MAX
|
||||
assert_eq!(
|
||||
RateBucketInfo::DEFAULT_AUTH_SET,
|
||||
[
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(1),
|
||||
max_rpi: 300 * 4096,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(60),
|
||||
max_rpi: 200 * 4096 * 60,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(600),
|
||||
max_rpi: 100 * 4096 * 600,
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
for x in RateBucketInfo::DEFAULT_AUTH_SET {
|
||||
let y = x.to_string().parse().unwrap();
|
||||
assert_eq!(x, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,2 @@
|
||||
pub mod cancellation_publisher;
|
||||
pub mod connection_with_credentials_provider;
|
||||
pub mod elasticache;
|
||||
pub mod notifications;
|
||||
pub mod publisher;
|
||||
|
||||
@@ -1,161 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use pq_proto::CancelKeyData;
|
||||
use redis::AsyncCommands;
|
||||
use tokio::sync::Mutex;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
|
||||
|
||||
use super::{
|
||||
connection_with_credentials_provider::ConnectionWithCredentialsProvider,
|
||||
notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME},
|
||||
};
|
||||
|
||||
pub trait CancellationPublisherMut: Send + Sync + 'static {
|
||||
#[allow(async_fn_in_trait)]
|
||||
async fn try_publish(
|
||||
&mut self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub trait CancellationPublisher: Send + Sync + 'static {
|
||||
#[allow(async_fn_in_trait)]
|
||||
async fn try_publish(
|
||||
&self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
impl CancellationPublisher for () {
|
||||
async fn try_publish(
|
||||
&self,
|
||||
_cancel_key_data: CancelKeyData,
|
||||
_session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: CancellationPublisher> CancellationPublisherMut for P {
|
||||
async fn try_publish(
|
||||
&mut self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
<P as CancellationPublisher>::try_publish(self, cancel_key_data, session_id).await
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: CancellationPublisher> CancellationPublisher for Option<P> {
|
||||
async fn try_publish(
|
||||
&self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(p) = self {
|
||||
p.try_publish(cancel_key_data, session_id).await
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: CancellationPublisherMut> CancellationPublisher for Arc<Mutex<P>> {
|
||||
async fn try_publish(
|
||||
&self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
self.lock()
|
||||
.await
|
||||
.try_publish(cancel_key_data, session_id)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RedisPublisherClient {
|
||||
client: ConnectionWithCredentialsProvider,
|
||||
region_id: String,
|
||||
limiter: RedisRateLimiter,
|
||||
}
|
||||
|
||||
impl RedisPublisherClient {
|
||||
pub fn new(
|
||||
client: ConnectionWithCredentialsProvider,
|
||||
region_id: String,
|
||||
info: &'static [RateBucketInfo],
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
client,
|
||||
region_id,
|
||||
limiter: RedisRateLimiter::new(info),
|
||||
})
|
||||
}
|
||||
|
||||
async fn publish(
|
||||
&mut self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
|
||||
region_id: Some(self.region_id.clone()),
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
}))?;
|
||||
self.client.publish(PROXY_CHANNEL_NAME, payload).await?;
|
||||
Ok(())
|
||||
}
|
||||
pub async fn try_connect(&mut self) -> anyhow::Result<()> {
|
||||
match self.client.connect().await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
tracing::error!("failed to connect to redis: {e}");
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
async fn try_publish_internal(
|
||||
&mut self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
if !self.limiter.check() {
|
||||
tracing::info!("Rate limit exceeded. Skipping cancellation message");
|
||||
return Err(anyhow::anyhow!("Rate limit exceeded"));
|
||||
}
|
||||
match self.publish(cancel_key_data, session_id).await {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(e) => {
|
||||
tracing::error!("failed to publish a message: {e}");
|
||||
}
|
||||
}
|
||||
tracing::info!("Publisher is disconnected. Reconnectiong...");
|
||||
self.try_connect().await?;
|
||||
self.publish(cancel_key_data, session_id).await
|
||||
}
|
||||
}
|
||||
|
||||
impl CancellationPublisherMut for RedisPublisherClient {
|
||||
async fn try_publish(
|
||||
&mut self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
tracing::info!("publishing cancellation key to Redis");
|
||||
match self.try_publish_internal(cancel_key_data, session_id).await {
|
||||
Ok(()) => {
|
||||
tracing::info!("cancellation key successfuly published to Redis");
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("failed to publish a message: {e}");
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,225 +0,0 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use futures::FutureExt;
|
||||
use redis::{
|
||||
aio::{ConnectionLike, MultiplexedConnection},
|
||||
ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult,
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tracing::{error, info};
|
||||
|
||||
use super::elasticache::CredentialsProvider;
|
||||
|
||||
enum Credentials {
|
||||
Static(ConnectionInfo),
|
||||
Dynamic(Arc<CredentialsProvider>, redis::ConnectionAddr),
|
||||
}
|
||||
|
||||
impl Clone for Credentials {
|
||||
fn clone(&self) -> Self {
|
||||
match self {
|
||||
Credentials::Static(info) => Credentials::Static(info.clone()),
|
||||
Credentials::Dynamic(provider, addr) => {
|
||||
Credentials::Dynamic(Arc::clone(provider), addr.clone())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token.
|
||||
/// Provides PubSub connection without credentials refresh.
|
||||
pub struct ConnectionWithCredentialsProvider {
|
||||
credentials: Credentials,
|
||||
con: Option<MultiplexedConnection>,
|
||||
refresh_token_task: Option<JoinHandle<()>>,
|
||||
mutex: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
impl Clone for ConnectionWithCredentialsProvider {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
credentials: self.credentials.clone(),
|
||||
con: None,
|
||||
refresh_token_task: None,
|
||||
mutex: tokio::sync::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ConnectionWithCredentialsProvider {
|
||||
pub fn new_with_credentials_provider(
|
||||
host: String,
|
||||
port: u16,
|
||||
credentials_provider: Arc<CredentialsProvider>,
|
||||
) -> Self {
|
||||
Self {
|
||||
credentials: Credentials::Dynamic(
|
||||
credentials_provider,
|
||||
redis::ConnectionAddr::TcpTls {
|
||||
host,
|
||||
port,
|
||||
insecure: false,
|
||||
tls_params: None,
|
||||
},
|
||||
),
|
||||
con: None,
|
||||
refresh_token_task: None,
|
||||
mutex: tokio::sync::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_with_static_credentials<T: IntoConnectionInfo>(params: T) -> Self {
|
||||
Self {
|
||||
credentials: Credentials::Static(params.into_connection_info().unwrap()),
|
||||
con: None,
|
||||
refresh_token_task: None,
|
||||
mutex: tokio::sync::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn connect(&mut self) -> anyhow::Result<()> {
|
||||
let _guard = self.mutex.lock().await;
|
||||
if let Some(con) = self.con.as_mut() {
|
||||
match redis::cmd("PING").query_async(con).await {
|
||||
Ok(()) => {
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Error during PING: {e:?}");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("Connection is not established");
|
||||
}
|
||||
info!("Establishing a new connection...");
|
||||
self.con = None;
|
||||
if let Some(f) = self.refresh_token_task.take() {
|
||||
f.abort()
|
||||
}
|
||||
let con = self
|
||||
.get_client()
|
||||
.await?
|
||||
.get_multiplexed_tokio_connection()
|
||||
.await?;
|
||||
if let Credentials::Dynamic(credentials_provider, _) = &self.credentials {
|
||||
let credentials_provider = credentials_provider.clone();
|
||||
let con2 = con.clone();
|
||||
let f = tokio::spawn(async move {
|
||||
let _ = Self::keep_connection(con2, credentials_provider).await;
|
||||
});
|
||||
self.refresh_token_task = Some(f);
|
||||
}
|
||||
self.con = Some(con);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_connection_info(&self) -> anyhow::Result<ConnectionInfo> {
|
||||
match &self.credentials {
|
||||
Credentials::Static(info) => Ok(info.clone()),
|
||||
Credentials::Dynamic(provider, addr) => {
|
||||
let (username, password) = provider.provide_credentials().await?;
|
||||
Ok(ConnectionInfo {
|
||||
addr: addr.clone(),
|
||||
redis: RedisConnectionInfo {
|
||||
db: 0,
|
||||
username: Some(username),
|
||||
password: Some(password.clone()),
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_client(&self) -> anyhow::Result<redis::Client> {
|
||||
let client = redis::Client::open(self.get_connection_info().await?)?;
|
||||
Ok(client)
|
||||
}
|
||||
|
||||
// PubSub does not support credentials refresh.
|
||||
// Requires manual reconnection every 12h.
|
||||
pub async fn get_async_pubsub(&self) -> anyhow::Result<redis::aio::PubSub> {
|
||||
Ok(self.get_client().await?.get_async_pubsub().await?)
|
||||
}
|
||||
|
||||
// The connection lives for 12h.
|
||||
// It can be prolonged with sending `AUTH` commands with the refreshed token.
|
||||
// https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html#auth-iam-limits
|
||||
async fn keep_connection(
|
||||
mut con: MultiplexedConnection,
|
||||
credentials_provider: Arc<CredentialsProvider>,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
// The connection lives for 12h, for the sanity check we refresh it every hour.
|
||||
tokio::time::sleep(Duration::from_secs(60 * 60)).await;
|
||||
match Self::refresh_token(&mut con, credentials_provider.clone()).await {
|
||||
Ok(()) => {
|
||||
info!("Token refreshed");
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Error during token refresh: {e:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
async fn refresh_token(
|
||||
con: &mut MultiplexedConnection,
|
||||
credentials_provider: Arc<CredentialsProvider>,
|
||||
) -> anyhow::Result<()> {
|
||||
let (user, password) = credentials_provider.provide_credentials().await?;
|
||||
redis::cmd("AUTH")
|
||||
.arg(user)
|
||||
.arg(password)
|
||||
.query_async(con)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
/// Sends an already encoded (packed) command into the TCP socket and
|
||||
/// reads the single response from it.
|
||||
pub async fn send_packed_command(&mut self, cmd: &redis::Cmd) -> RedisResult<redis::Value> {
|
||||
// Clone connection to avoid having to lock the ArcSwap in write mode
|
||||
let con = self.con.as_mut().ok_or(redis::RedisError::from((
|
||||
redis::ErrorKind::IoError,
|
||||
"Connection not established",
|
||||
)))?;
|
||||
con.send_packed_command(cmd).await
|
||||
}
|
||||
|
||||
/// Sends multiple already encoded (packed) command into the TCP socket
|
||||
/// and reads `count` responses from it. This is used to implement
|
||||
/// pipelining.
|
||||
pub async fn send_packed_commands(
|
||||
&mut self,
|
||||
cmd: &redis::Pipeline,
|
||||
offset: usize,
|
||||
count: usize,
|
||||
) -> RedisResult<Vec<redis::Value>> {
|
||||
// Clone shared connection future to avoid having to lock the ArcSwap in write mode
|
||||
let con = self.con.as_mut().ok_or(redis::RedisError::from((
|
||||
redis::ErrorKind::IoError,
|
||||
"Connection not established",
|
||||
)))?;
|
||||
con.send_packed_commands(cmd, offset, count).await
|
||||
}
|
||||
}
|
||||
|
||||
impl ConnectionLike for ConnectionWithCredentialsProvider {
|
||||
fn req_packed_command<'a>(
|
||||
&'a mut self,
|
||||
cmd: &'a redis::Cmd,
|
||||
) -> redis::RedisFuture<'a, redis::Value> {
|
||||
(async move { self.send_packed_command(cmd).await }).boxed()
|
||||
}
|
||||
|
||||
fn req_packed_commands<'a>(
|
||||
&'a mut self,
|
||||
cmd: &'a redis::Pipeline,
|
||||
offset: usize,
|
||||
count: usize,
|
||||
) -> redis::RedisFuture<'a, Vec<redis::Value>> {
|
||||
(async move { self.send_packed_commands(cmd, offset, count).await }).boxed()
|
||||
}
|
||||
|
||||
fn get_db(&self) -> i64 {
|
||||
0
|
||||
}
|
||||
}
|
||||
@@ -1,110 +0,0 @@
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use aws_config::meta::credentials::CredentialsProviderChain;
|
||||
use aws_sdk_iam::config::ProvideCredentials;
|
||||
use aws_sigv4::http_request::{
|
||||
self, SignableBody, SignableRequest, SignatureLocation, SigningSettings,
|
||||
};
|
||||
use tracing::info;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct AWSIRSAConfig {
|
||||
region: String,
|
||||
service_name: String,
|
||||
cluster_name: String,
|
||||
user_id: String,
|
||||
token_ttl: Duration,
|
||||
action: String,
|
||||
}
|
||||
|
||||
impl AWSIRSAConfig {
|
||||
pub fn new(region: String, cluster_name: Option<String>, user_id: Option<String>) -> Self {
|
||||
AWSIRSAConfig {
|
||||
region,
|
||||
service_name: "elasticache".to_string(),
|
||||
cluster_name: cluster_name.unwrap_or_default(),
|
||||
user_id: user_id.unwrap_or_default(),
|
||||
// "The IAM authentication token is valid for 15 minutes"
|
||||
// https://docs.aws.amazon.com/memorydb/latest/devguide/auth-iam.html#auth-iam-limits
|
||||
token_ttl: Duration::from_secs(15 * 60),
|
||||
action: "connect".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Credentials provider for AWS elasticache authentication.
|
||||
///
|
||||
/// Official documentation:
|
||||
/// <https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html>
|
||||
///
|
||||
/// Useful resources:
|
||||
/// <https://aws.amazon.com/blogs/database/simplify-managing-access-to-amazon-elasticache-for-redis-clusters-with-iam/>
|
||||
pub struct CredentialsProvider {
|
||||
config: AWSIRSAConfig,
|
||||
credentials_provider: CredentialsProviderChain,
|
||||
}
|
||||
|
||||
impl CredentialsProvider {
|
||||
pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self {
|
||||
CredentialsProvider {
|
||||
config,
|
||||
credentials_provider,
|
||||
}
|
||||
}
|
||||
pub async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
|
||||
let aws_credentials = self
|
||||
.credentials_provider
|
||||
.provide_credentials()
|
||||
.await?
|
||||
.into();
|
||||
info!("AWS credentials successfully obtained");
|
||||
info!("Connecting to Redis with configuration: {:?}", self.config);
|
||||
let mut settings = SigningSettings::default();
|
||||
settings.signature_location = SignatureLocation::QueryParams;
|
||||
settings.expires_in = Some(self.config.token_ttl);
|
||||
let signing_params = aws_sigv4::sign::v4::SigningParams::builder()
|
||||
.identity(&aws_credentials)
|
||||
.region(&self.config.region)
|
||||
.name(&self.config.service_name)
|
||||
.time(SystemTime::now())
|
||||
.settings(settings)
|
||||
.build()?
|
||||
.into();
|
||||
let auth_params = [
|
||||
("Action", &self.config.action),
|
||||
("User", &self.config.user_id),
|
||||
];
|
||||
let auth_params = url::form_urlencoded::Serializer::new(String::new())
|
||||
.extend_pairs(auth_params)
|
||||
.finish();
|
||||
let auth_uri = http::Uri::builder()
|
||||
.scheme("http")
|
||||
.authority(self.config.cluster_name.as_bytes())
|
||||
.path_and_query(format!("/?{auth_params}"))
|
||||
.build()?;
|
||||
info!("{}", auth_uri);
|
||||
|
||||
// Convert the HTTP request into a signable request
|
||||
let signable_request = SignableRequest::new(
|
||||
"GET",
|
||||
auth_uri.to_string(),
|
||||
std::iter::empty(),
|
||||
SignableBody::Bytes(&[]),
|
||||
)?;
|
||||
|
||||
// Sign and then apply the signature to the request
|
||||
let (si, _) = http_request::sign(signable_request, &signing_params)?.into_parts();
|
||||
let mut signable_request = http::Request::builder()
|
||||
.method("GET")
|
||||
.uri(auth_uri)
|
||||
.body(())?;
|
||||
si.apply_to_request_http1x(&mut signable_request);
|
||||
Ok((
|
||||
self.config.user_id.clone(),
|
||||
signable_request
|
||||
.uri()
|
||||
.to_string()
|
||||
.replacen("http://", "", 1),
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -6,12 +6,11 @@ use redis::aio::PubSub;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use crate::{
|
||||
cache::project_info::ProjectInfoCache,
|
||||
cancellation::{CancelMap, CancellationHandler},
|
||||
cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler},
|
||||
intern::{ProjectIdInt, RoleNameInt},
|
||||
metrics::{NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS, REDIS_BROKEN_MESSAGES},
|
||||
metrics::REDIS_BROKEN_MESSAGES,
|
||||
};
|
||||
|
||||
const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
|
||||
@@ -19,13 +18,23 @@ pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
|
||||
const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
|
||||
const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20);
|
||||
|
||||
async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Result<PubSub> {
|
||||
let mut conn = client.get_async_pubsub().await?;
|
||||
tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
|
||||
conn.subscribe(CPLANE_CHANNEL_NAME).await?;
|
||||
tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
|
||||
conn.subscribe(PROXY_CHANNEL_NAME).await?;
|
||||
Ok(conn)
|
||||
struct RedisConsumerClient {
|
||||
client: redis::Client,
|
||||
}
|
||||
|
||||
impl RedisConsumerClient {
|
||||
pub fn new(url: &str) -> anyhow::Result<Self> {
|
||||
let client = redis::Client::open(url)?;
|
||||
Ok(Self { client })
|
||||
}
|
||||
async fn try_connect(&self) -> anyhow::Result<PubSub> {
|
||||
let mut conn = self.client.get_async_connection().await?.into_pubsub();
|
||||
tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`");
|
||||
conn.subscribe(CPLANE_CHANNEL_NAME).await?;
|
||||
tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`");
|
||||
conn.subscribe(PROXY_CHANNEL_NAME).await?;
|
||||
Ok(conn)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
@@ -71,18 +80,21 @@ where
|
||||
serde_json::from_str(&s).map_err(<D::Error as serde::de::Error>::custom)
|
||||
}
|
||||
|
||||
struct MessageHandler<C: ProjectInfoCache + Send + Sync + 'static> {
|
||||
struct MessageHandler<
|
||||
C: ProjectInfoCache + Send + Sync + 'static,
|
||||
H: NotificationsCancellationHandler + Send + Sync + 'static,
|
||||
> {
|
||||
cache: Arc<C>,
|
||||
cancellation_handler: Arc<CancellationHandler<()>>,
|
||||
cancellation_handler: Arc<H>,
|
||||
region_id: String,
|
||||
}
|
||||
|
||||
impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
pub fn new(
|
||||
cache: Arc<C>,
|
||||
cancellation_handler: Arc<CancellationHandler<()>>,
|
||||
region_id: String,
|
||||
) -> Self {
|
||||
impl<
|
||||
C: ProjectInfoCache + Send + Sync + 'static,
|
||||
H: NotificationsCancellationHandler + Send + Sync + 'static,
|
||||
> MessageHandler<C, H>
|
||||
{
|
||||
pub fn new(cache: Arc<C>, cancellation_handler: Arc<H>, region_id: String) -> Self {
|
||||
Self {
|
||||
cache,
|
||||
cancellation_handler,
|
||||
@@ -127,7 +139,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
// This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
|
||||
match self
|
||||
.cancellation_handler
|
||||
.cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil())
|
||||
.cancel_session_no_publish(cancel_session.cancel_key_data)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
@@ -170,7 +182,7 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
|
||||
/// Handle console's invalidation messages.
|
||||
#[tracing::instrument(name = "console_notifications", skip_all)]
|
||||
pub async fn task_main<C>(
|
||||
redis: ConnectionWithCredentialsProvider,
|
||||
url: String,
|
||||
cache: Arc<C>,
|
||||
cancel_map: CancelMap,
|
||||
region_id: String,
|
||||
@@ -181,15 +193,13 @@ where
|
||||
cache.enable_ttl();
|
||||
let handler = MessageHandler::new(
|
||||
cache,
|
||||
Arc::new(CancellationHandler::<()>::new(
|
||||
cancel_map,
|
||||
NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
|
||||
)),
|
||||
Arc::new(CancellationHandler::new(cancel_map, None)),
|
||||
region_id,
|
||||
);
|
||||
|
||||
loop {
|
||||
let mut conn = match try_connect(&redis).await {
|
||||
let redis = RedisConsumerClient::new(&url)?;
|
||||
let conn = match redis.try_connect().await {
|
||||
Ok(conn) => {
|
||||
handler.disable_ttl();
|
||||
conn
|
||||
@@ -202,7 +212,7 @@ where
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let mut stream = conn.on_message();
|
||||
let mut stream = conn.into_on_message();
|
||||
while let Some(msg) = stream.next().await {
|
||||
match handler.handle_message(msg).await {
|
||||
Ok(()) => {}
|
||||
|
||||
80
proxy/src/redis/publisher.rs
Normal file
80
proxy/src/redis/publisher.rs
Normal file
@@ -0,0 +1,80 @@
|
||||
use pq_proto::CancelKeyData;
|
||||
use redis::AsyncCommands;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
|
||||
|
||||
use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME};
|
||||
|
||||
pub struct RedisPublisherClient {
|
||||
client: redis::Client,
|
||||
publisher: Option<redis::aio::Connection>,
|
||||
region_id: String,
|
||||
limiter: RedisRateLimiter,
|
||||
}
|
||||
|
||||
impl RedisPublisherClient {
|
||||
pub fn new(
|
||||
url: &str,
|
||||
region_id: String,
|
||||
info: &'static [RateBucketInfo],
|
||||
) -> anyhow::Result<Self> {
|
||||
let client = redis::Client::open(url)?;
|
||||
Ok(Self {
|
||||
client,
|
||||
publisher: None,
|
||||
region_id,
|
||||
limiter: RedisRateLimiter::new(info),
|
||||
})
|
||||
}
|
||||
pub async fn try_publish(
|
||||
&mut self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
if !self.limiter.check() {
|
||||
tracing::info!("Rate limit exceeded. Skipping cancellation message");
|
||||
return Err(anyhow::anyhow!("Rate limit exceeded"));
|
||||
}
|
||||
match self.publish(cancel_key_data, session_id).await {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(e) => {
|
||||
tracing::error!("failed to publish a message: {e}");
|
||||
self.publisher = None;
|
||||
}
|
||||
}
|
||||
tracing::info!("Publisher is disconnected. Reconnectiong...");
|
||||
self.try_connect().await?;
|
||||
self.publish(cancel_key_data, session_id).await
|
||||
}
|
||||
|
||||
async fn publish(
|
||||
&mut self,
|
||||
cancel_key_data: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
) -> anyhow::Result<()> {
|
||||
let conn = self
|
||||
.publisher
|
||||
.as_mut()
|
||||
.ok_or_else(|| anyhow::anyhow!("not connected"))?;
|
||||
let payload = serde_json::to_string(&Notification::Cancel(CancelSession {
|
||||
region_id: Some(self.region_id.clone()),
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
}))?;
|
||||
conn.publish(PROXY_CHANNEL_NAME, payload).await?;
|
||||
Ok(())
|
||||
}
|
||||
pub async fn try_connect(&mut self) -> anyhow::Result<()> {
|
||||
match self.client.get_async_connection().await {
|
||||
Ok(conn) => {
|
||||
self.publisher = Some(conn);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("failed to connect to redis: {e}");
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -3,7 +3,9 @@
|
||||
use std::convert::Infallible;
|
||||
|
||||
use hmac::{Hmac, Mac};
|
||||
use sha2::Sha256;
|
||||
use sha2::digest::FixedOutput;
|
||||
use sha2::{Digest, Sha256};
|
||||
use subtle::{Choice, ConstantTimeEq};
|
||||
use tokio::task::yield_now;
|
||||
|
||||
use super::messages::{
|
||||
@@ -11,7 +13,6 @@ use super::messages::{
|
||||
};
|
||||
use super::secret::ServerSecret;
|
||||
use super::signature::SignatureBuilder;
|
||||
use super::ScramKey;
|
||||
use crate::config;
|
||||
use crate::sasl::{self, ChannelBinding, Error as SaslError};
|
||||
|
||||
@@ -103,7 +104,7 @@ async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
|
||||
}
|
||||
|
||||
// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
|
||||
async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
|
||||
async fn derive_keys(password: &[u8], salt: &[u8], iterations: u32) -> ([u8; 32], [u8; 32]) {
|
||||
let salted_password = pbkdf2(password, salt, iterations).await;
|
||||
|
||||
let make_key = |name| {
|
||||
@@ -115,7 +116,7 @@ async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> Scr
|
||||
<[u8; 32]>::from(key.into_bytes())
|
||||
};
|
||||
|
||||
make_key(b"Client Key").into()
|
||||
(make_key(b"Client Key"), make_key(b"Server Key"))
|
||||
}
|
||||
|
||||
pub async fn exchange(
|
||||
@@ -123,12 +124,21 @@ pub async fn exchange(
|
||||
password: &[u8],
|
||||
) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
|
||||
let salt = base64::decode(&secret.salt_base64)?;
|
||||
let client_key = derive_client_key(password, &salt, secret.iterations).await;
|
||||
let (client_key, server_key) = derive_keys(password, &salt, secret.iterations).await;
|
||||
let stored_key: [u8; 32] = Sha256::default()
|
||||
.chain_update(client_key)
|
||||
.finalize_fixed()
|
||||
.into();
|
||||
|
||||
if secret.is_password_invalid(&client_key).into() {
|
||||
Ok(sasl::Outcome::Failure("password doesn't match"))
|
||||
// constant time to not leak partial key match
|
||||
let valid = stored_key.ct_eq(&secret.stored_key.as_bytes())
|
||||
| server_key.ct_eq(&secret.server_key.as_bytes())
|
||||
| Choice::from(secret.doomed as u8);
|
||||
|
||||
if valid.into() {
|
||||
Ok(sasl::Outcome::Success(super::ScramKey::from(client_key)))
|
||||
} else {
|
||||
Ok(sasl::Outcome::Success(client_key))
|
||||
Ok(sasl::Outcome::Failure("password doesn't match"))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -210,7 +220,7 @@ impl SaslSentInner {
|
||||
.derive_client_key(&client_final_message.proof);
|
||||
|
||||
// Auth fails either if keys don't match or it's pre-determined to fail.
|
||||
if secret.is_password_invalid(&client_key).into() {
|
||||
if client_key.sha256() != secret.stored_key || secret.doomed {
|
||||
return Ok(sasl::Step::Failure("password doesn't match"));
|
||||
}
|
||||
|
||||
|
||||
@@ -1,31 +1,17 @@
|
||||
//! Tools for client/server/stored key management.
|
||||
|
||||
use subtle::ConstantTimeEq;
|
||||
|
||||
/// Faithfully taken from PostgreSQL.
|
||||
pub const SCRAM_KEY_LEN: usize = 32;
|
||||
|
||||
/// One of the keys derived from the user's password.
|
||||
/// We use the same structure for all keys, i.e.
|
||||
/// `ClientKey`, `StoredKey`, and `ServerKey`.
|
||||
#[derive(Clone, Default, Eq, Debug)]
|
||||
#[derive(Clone, Default, PartialEq, Eq, Debug)]
|
||||
#[repr(transparent)]
|
||||
pub struct ScramKey {
|
||||
bytes: [u8; SCRAM_KEY_LEN],
|
||||
}
|
||||
|
||||
impl PartialEq for ScramKey {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.ct_eq(other).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl ConstantTimeEq for ScramKey {
|
||||
fn ct_eq(&self, other: &Self) -> subtle::Choice {
|
||||
self.bytes.ct_eq(&other.bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl ScramKey {
|
||||
pub fn sha256(&self) -> Self {
|
||||
super::sha256([self.as_ref()]).into()
|
||||
|
||||
@@ -206,28 +206,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_client_first_message_with_invalid_gs2_authz() {
|
||||
assert!(ClientFirstMessage::parse("n,authzid,n=user,r=nonce").is_none())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_client_first_message_with_extra_params() {
|
||||
let msg = ClientFirstMessage::parse("n,,n=user,r=nonce,a=foo,b=bar,c=baz").unwrap();
|
||||
assert_eq!(msg.bare, "n=user,r=nonce,a=foo,b=bar,c=baz");
|
||||
assert_eq!(msg.username, "user");
|
||||
assert_eq!(msg.nonce, "nonce");
|
||||
assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_client_first_message_with_extra_params_invalid() {
|
||||
// must be of the form `<ascii letter>=<...>`
|
||||
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,abc=foo").is_none());
|
||||
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,1=foo").is_none());
|
||||
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,a").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_client_final_message() {
|
||||
let input = [
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
//! Tools for SCRAM server secret management.
|
||||
|
||||
use subtle::{Choice, ConstantTimeEq};
|
||||
|
||||
use super::base64_decode_array;
|
||||
use super::key::ScramKey;
|
||||
|
||||
@@ -42,21 +40,16 @@ impl ServerSecret {
|
||||
Some(secret)
|
||||
}
|
||||
|
||||
pub fn is_password_invalid(&self, client_key: &ScramKey) -> Choice {
|
||||
// constant time to not leak partial key match
|
||||
client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8)
|
||||
}
|
||||
|
||||
/// To avoid revealing information to an attacker, we use a
|
||||
/// mocked server secret even if the user doesn't exist.
|
||||
/// See `auth-scram.c : mock_scram_secret` for details.
|
||||
pub fn mock(nonce: [u8; 32]) -> Self {
|
||||
pub fn mock(user: &str, nonce: [u8; 32]) -> Self {
|
||||
// Refer to `auth-scram.c : scram_mock_salt`.
|
||||
let mocked_salt = super::sha256([user.as_bytes(), &nonce]);
|
||||
|
||||
Self {
|
||||
// this doesn't reveal much information as we're going to use
|
||||
// iteration count 1 for our generated passwords going forward.
|
||||
// PG16 users can set iteration count=1 already today.
|
||||
iterations: 1,
|
||||
salt_base64: base64::encode(nonce),
|
||||
iterations: 4096,
|
||||
salt_base64: base64::encode(mocked_salt),
|
||||
stored_key: ScramKey::default(),
|
||||
server_key: ScramKey::default(),
|
||||
doomed: true,
|
||||
|
||||
@@ -21,12 +21,11 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
use tokio_util::task::TaskTracker;
|
||||
use tracing::instrument::Instrumented;
|
||||
|
||||
use crate::cancellation::CancellationHandlerMain;
|
||||
use crate::config::ProxyConfig;
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
|
||||
use crate::rate_limiter::EndpointRateLimiter;
|
||||
use crate::serverless::backend::PoolingBackend;
|
||||
use crate::{cancellation::CancellationHandler, config::ProxyConfig};
|
||||
use hyper::{
|
||||
server::conn::{AddrIncoming, AddrStream},
|
||||
Body, Method, Request, Response,
|
||||
@@ -48,7 +47,7 @@ pub async fn task_main(
|
||||
ws_listener: TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("websocket server has shut down");
|
||||
@@ -238,7 +237,7 @@ async fn request_handler(
|
||||
config: &'static ProxyConfig,
|
||||
backend: Arc<PoolingBackend>,
|
||||
ws_connections: TaskTracker,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
peer_addr: IpAddr,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
// used to cancel in-flight HTTP requests. not used to cancel websockets
|
||||
|
||||
@@ -42,12 +42,7 @@ impl PoolingBackend {
|
||||
};
|
||||
|
||||
let secret = match cached_secret.value.clone() {
|
||||
Some(secret) => self.config.authentication_config.check_rate_limit(
|
||||
ctx,
|
||||
secret,
|
||||
&user_info.endpoint,
|
||||
true,
|
||||
)?,
|
||||
Some(secret) => secret,
|
||||
None => {
|
||||
// If we don't have an authentication secret, for the http flow we can just return an error.
|
||||
info!("authentication info not found");
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::{
|
||||
cancellation::CancellationHandlerMain,
|
||||
cancellation::CancellationHandler,
|
||||
config::ProxyConfig,
|
||||
context::RequestMonitoring,
|
||||
error::{io_error, ReportableError},
|
||||
@@ -134,7 +134,7 @@ pub async fn serve_websocket(
|
||||
config: &'static ProxyConfig,
|
||||
mut ctx: RequestMonitoring,
|
||||
websocket: HyperWebsocket,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
hostname: Option<String>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> anyhow::Result<()> {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.77.0"
|
||||
channel = "1.76.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -225,7 +225,6 @@ async fn write_segment(
|
||||
assert!(from <= to);
|
||||
assert!(to <= wal_seg_size);
|
||||
|
||||
#[allow(clippy::suspicious_open_options)]
|
||||
let mut file = OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
|
||||
@@ -221,7 +221,6 @@ impl PhysicalStorage {
|
||||
// half initialized segment, first bake it under tmp filename and
|
||||
// then rename.
|
||||
let tmp_path = self.timeline_dir.join("waltmp");
|
||||
#[allow(clippy::suspicious_open_options)]
|
||||
let mut file = OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
|
||||
@@ -244,7 +244,6 @@ impl SimulationApi {
|
||||
mutex: 0,
|
||||
mineLastElectedTerm: 0,
|
||||
backpressureThrottlingTime: pg_atomic_uint64 { value: 0 },
|
||||
currentClusterSize: pg_atomic_uint64 { value: 0 },
|
||||
shard_ps_feedback: [empty_feedback; 128],
|
||||
num_shards: 0,
|
||||
min_ps_feedback: empty_feedback,
|
||||
|
||||
@@ -1155,17 +1155,13 @@ class NeonEnv:
|
||||
After this method returns, there should be no child processes running.
|
||||
"""
|
||||
self.endpoints.stop_all()
|
||||
|
||||
# Stop storage controller before pageservers: we don't want it to spuriously
|
||||
# detect a pageserver "failure" during test teardown
|
||||
self.storage_controller.stop(immediate=immediate)
|
||||
|
||||
for sk in self.safekeepers:
|
||||
sk.stop(immediate=immediate)
|
||||
for pageserver in self.pageservers:
|
||||
if ps_assert_metric_no_errors:
|
||||
pageserver.assert_no_metric_errors()
|
||||
pageserver.stop(immediate=immediate)
|
||||
self.storage_controller.stop(immediate=immediate)
|
||||
self.broker.stop(immediate=immediate)
|
||||
|
||||
@property
|
||||
@@ -2126,8 +2122,6 @@ class NeonStorageController(MetricsGetter):
|
||||
shard_params = {"count": shard_count}
|
||||
if shard_stripe_size is not None:
|
||||
shard_params["stripe_size"] = shard_stripe_size
|
||||
else:
|
||||
shard_params["stripe_size"] = 32768
|
||||
|
||||
body["shard_parameters"] = shard_params
|
||||
|
||||
@@ -2141,7 +2135,6 @@ class NeonStorageController(MetricsGetter):
|
||||
json=body,
|
||||
headers=self.headers(TokenScope.PAGE_SERVER_API),
|
||||
)
|
||||
response.raise_for_status()
|
||||
log.info(f"tenant_create success: {response.json()}")
|
||||
|
||||
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
|
||||
|
||||
@@ -86,9 +86,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
|
||||
# This is especially pronounced in tests that set small checkpoint
|
||||
# distances.
|
||||
".*Flushed oversized open layer with size.*",
|
||||
# During teardown, we stop the storage controller before the pageservers, so pageservers
|
||||
# can experience connection errors doing background deletion queue work.
|
||||
".*WARN deletion backend: calling control plane generation validation API failed.*Connection refused.*",
|
||||
)
|
||||
|
||||
|
||||
@@ -99,8 +96,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
|
||||
".*Call to node.*management API.*failed.*ReceiveBody.*",
|
||||
# Many tests will start up with a node offline
|
||||
".*startup_reconcile: Could not scan node.*",
|
||||
# Tests run in dev mode
|
||||
".*Starting in dev mode.*",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -62,7 +62,9 @@ def wait_for_upload(
|
||||
)
|
||||
time.sleep(1)
|
||||
raise Exception(
|
||||
f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}"
|
||||
"timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
|
||||
lsn, current_lsn
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from contextlib import closing
|
||||
|
||||
import pytest
|
||||
from fixtures.benchmark_fixture import MetricReport
|
||||
from fixtures.compare_fixtures import NeonCompare, PgCompare
|
||||
from fixtures.pageserver.utils import wait_tenant_status_404
|
||||
@@ -18,7 +17,6 @@ from fixtures.types import Lsn
|
||||
# 3. Disk space used
|
||||
# 4. Peak memory usage
|
||||
#
|
||||
@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124")
|
||||
def test_bulk_insert(neon_with_baseline: PgCompare):
|
||||
env = neon_with_baseline
|
||||
|
||||
|
||||
@@ -105,7 +105,7 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
|
||||
# The neon_local tool generates one key pair at a hardcoded path by default.
|
||||
# As a preparation for our test, move the public key of the key pair into a
|
||||
# directory at the same location as the hardcoded path by:
|
||||
# 1. moving the file at `configured_pub_key_path` to a temporary location
|
||||
# 1. moving the the file at `configured_pub_key_path` to a temporary location
|
||||
# 2. creating a new directory at `configured_pub_key_path`
|
||||
# 3. moving the file from the temporary location into the newly created directory
|
||||
configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem"
|
||||
|
||||
@@ -267,10 +267,9 @@ def test_forward_compatibility(
|
||||
|
||||
def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path):
|
||||
ep = env.endpoints.create_start("main")
|
||||
connstr = ep.connstr()
|
||||
|
||||
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
|
||||
|
||||
connstr = ep.connstr()
|
||||
pg_bin.run_capture(
|
||||
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]
|
||||
)
|
||||
@@ -287,9 +286,6 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
|
||||
timeline_id = env.initial_timeline
|
||||
pg_version = env.pg_version
|
||||
|
||||
# Stop endpoint while we recreate timeline
|
||||
ep.stop()
|
||||
|
||||
try:
|
||||
pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id)
|
||||
except PageserverApiException as e:
|
||||
@@ -314,9 +310,6 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
|
||||
existing_initdb_timeline_id=timeline_id,
|
||||
)
|
||||
|
||||
# Timeline exists again: restart the endpoint
|
||||
ep.start()
|
||||
|
||||
pg_bin.run_capture(
|
||||
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]
|
||||
)
|
||||
|
||||
@@ -85,7 +85,6 @@ def test_hot_standby(neon_simple_env: NeonEnv):
|
||||
if slow_down_send:
|
||||
sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off"))
|
||||
|
||||
|
||||
def test_2_replicas_start(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -94,11 +93,6 @@ def test_2_replicas_start(neon_simple_env: NeonEnv):
|
||||
endpoint_id="primary",
|
||||
) as primary:
|
||||
time.sleep(1)
|
||||
with env.endpoints.new_replica_start(
|
||||
origin=primary, endpoint_id="secondary1"
|
||||
) as secondary1:
|
||||
with env.endpoints.new_replica_start(
|
||||
origin=primary, endpoint_id="secondary2"
|
||||
) as secondary2:
|
||||
wait_replica_caughtup(primary, secondary1)
|
||||
wait_replica_caughtup(primary, secondary2)
|
||||
with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary1") as secondary1:
|
||||
with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary2") as secondary2:
|
||||
pass
|
||||
|
||||
@@ -1,275 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
from typing import Tuple
|
||||
|
||||
import psutil
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
tenant_get_shards,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
TIMELINE_COUNT = 10
|
||||
ENTRIES_PER_TIMELINE = 10_000
|
||||
CHECKPOINT_TIMEOUT_SECONDS = 60
|
||||
|
||||
|
||||
async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]:
|
||||
tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf)
|
||||
with env.endpoints.create_start("main", tenant_id=tenant) as ep:
|
||||
conn = await ep.connect_async()
|
||||
try:
|
||||
await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)")
|
||||
await conn.execute(
|
||||
f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i"
|
||||
)
|
||||
finally:
|
||||
await conn.close(timeout=10)
|
||||
|
||||
last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
|
||||
return tenant, timeline, last_flush_lsn
|
||||
|
||||
|
||||
async def workload(
|
||||
env: NeonEnv, tenant_conf, timelines: int, entries: int
|
||||
) -> list[Tuple[TenantId, TimelineId, Lsn]]:
|
||||
workers = [asyncio.create_task(run_worker(env, tenant_conf, entries)) for _ in range(timelines)]
|
||||
return await asyncio.gather(*workers)
|
||||
|
||||
|
||||
def wait_until_pageserver_is_caught_up(
|
||||
env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]]
|
||||
):
|
||||
for tenant, timeline, last_flush_lsn in last_flush_lsns:
|
||||
shards = tenant_get_shards(env, tenant)
|
||||
for tenant_shard_id, pageserver in shards:
|
||||
waited = wait_for_last_record_lsn(
|
||||
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
|
||||
)
|
||||
assert waited >= last_flush_lsn
|
||||
|
||||
|
||||
def wait_until_pageserver_has_uploaded(
|
||||
env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]]
|
||||
):
|
||||
for tenant, timeline, last_flush_lsn in last_flush_lsns:
|
||||
shards = tenant_get_shards(env, tenant)
|
||||
for tenant_shard_id, pageserver in shards:
|
||||
wait_for_upload(pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn)
|
||||
|
||||
|
||||
def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float:
|
||||
def query():
|
||||
value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total")
|
||||
assert value is not None
|
||||
return value
|
||||
|
||||
# The metric gets initialised on the first update.
|
||||
# Retry a few times, but return 0 if it's stable.
|
||||
try:
|
||||
return float(wait_until(3, 0.5, query))
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
def get_dirty_bytes(env):
|
||||
v = env.pageserver.http_client().get_metric_value("pageserver_timeline_ephemeral_bytes") or 0
|
||||
log.info(f"dirty_bytes: {v}")
|
||||
return v
|
||||
|
||||
|
||||
def assert_dirty_bytes(env, v):
|
||||
assert get_dirty_bytes(env) == v
|
||||
|
||||
|
||||
def assert_dirty_bytes_nonzero(env):
|
||||
assert get_dirty_bytes(env) > 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("immediate_shutdown", [True, False])
|
||||
def test_pageserver_small_inmemory_layers(
|
||||
neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool
|
||||
):
|
||||
"""
|
||||
Test that open layers get flushed after the `checkpoint_timeout` config
|
||||
and do not require WAL reingest upon restart.
|
||||
|
||||
The workload creates a number of timelines and writes some data to each,
|
||||
but not enough to trigger flushes via the `checkpoint_distance` config.
|
||||
"""
|
||||
tenant_conf = {
|
||||
# Large `checkpoint_distance` effectively disables size
|
||||
# based checkpointing.
|
||||
"checkpoint_distance": f"{2 * 1024 ** 3}",
|
||||
"checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s",
|
||||
"compaction_period": "1s",
|
||||
}
|
||||
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
last_flush_lsns = asyncio.run(workload(env, tenant_conf, TIMELINE_COUNT, ENTRIES_PER_TIMELINE))
|
||||
wait_until_pageserver_is_caught_up(env, last_flush_lsns)
|
||||
|
||||
# We didn't write enough data to trigger a size-based checkpoint: we should see dirty data.
|
||||
wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore
|
||||
|
||||
ps_http_client = env.pageserver.http_client()
|
||||
total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client)
|
||||
|
||||
# Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed,
|
||||
# such that there are zero bytes of ephemeral layer left on the pageserver
|
||||
log.info("Waiting for background checkpoints...")
|
||||
wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore
|
||||
|
||||
# Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they
|
||||
# must be uploaded to remain visible to the pageserver after restart.
|
||||
wait_until_pageserver_has_uploaded(env, last_flush_lsns)
|
||||
|
||||
env.pageserver.restart(immediate=immediate_shutdown)
|
||||
wait_until_pageserver_is_caught_up(env, last_flush_lsns)
|
||||
|
||||
# Catching up with WAL ingest should have resulted in zero bytes of ephemeral layers, since
|
||||
# we froze, flushed and uploaded everything before restarting. There can be no more WAL writes
|
||||
# because we shut down compute endpoints before flushing.
|
||||
assert get_dirty_bytes(env) == 0
|
||||
|
||||
total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client)
|
||||
|
||||
log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}")
|
||||
log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}")
|
||||
|
||||
assert total_wal_ingested_after_restart == 0
|
||||
|
||||
|
||||
def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that `checkpoint_timeout` is enforced even if there is no safekeeper input.
|
||||
"""
|
||||
tenant_conf = {
|
||||
# Large `checkpoint_distance` effectively disables size
|
||||
# based checkpointing.
|
||||
"checkpoint_distance": f"{2 * 1024 ** 3}",
|
||||
"checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s",
|
||||
"compaction_period": "1s",
|
||||
}
|
||||
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
last_flush_lsns = asyncio.run(workload(env, tenant_conf, TIMELINE_COUNT, ENTRIES_PER_TIMELINE))
|
||||
wait_until_pageserver_is_caught_up(env, last_flush_lsns)
|
||||
|
||||
# We didn't write enough data to trigger a size-based checkpoint: we should see dirty data.
|
||||
wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore
|
||||
|
||||
# Stop the safekeepers, so that we cannot have any more WAL receiver connections
|
||||
for sk in env.safekeepers:
|
||||
sk.stop()
|
||||
|
||||
# We should have got here fast enough that we didn't hit the background interval yet,
|
||||
# and the teardown of SK connections shouldn't prompt any layer freezing.
|
||||
assert get_dirty_bytes(env) > 0
|
||||
|
||||
# Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed,
|
||||
# such that there are zero bytes of ephemeral layer left on the pageserver
|
||||
log.info("Waiting for background checkpoints...")
|
||||
wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
# We have to use at least ~100MB of data to hit the lowest limit we can configure, which is
|
||||
# prohibitively slow in debug mode
|
||||
os.getenv("BUILD_TYPE") == "debug",
|
||||
reason="Avoid running bulkier ingest tests in debug mode",
|
||||
)
|
||||
def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that checkpoints are done based on total ephemeral layer size, even if no one timeline is
|
||||
individually exceeding checkpoint thresholds.
|
||||
"""
|
||||
|
||||
system_memory = psutil.virtual_memory().total
|
||||
|
||||
# The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 128MB on
|
||||
# a system with 128GB of RAM). We will then write enough data to violate this limit.
|
||||
max_dirty_data = 128 * 1024 * 1024
|
||||
ephemeral_bytes_per_memory_kb = (max_dirty_data * 1024) // system_memory
|
||||
assert ephemeral_bytes_per_memory_kb > 0
|
||||
|
||||
neon_env_builder.pageserver_config_override = f"""
|
||||
ephemeral_bytes_per_memory_kb={ephemeral_bytes_per_memory_kb}
|
||||
"""
|
||||
|
||||
compaction_period_s = 10
|
||||
|
||||
tenant_conf = {
|
||||
# Large space + time thresholds: effectively disable these limits
|
||||
"checkpoint_distance": f"{1024 ** 4}",
|
||||
"checkpoint_timeout": "3600s",
|
||||
"compaction_period": f"{compaction_period_s}s",
|
||||
}
|
||||
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
timeline_count = 10
|
||||
|
||||
# This is about 2MiB of data per timeline
|
||||
entries_per_timeline = 100_000
|
||||
|
||||
last_flush_lsns = asyncio.run(workload(env, tenant_conf, timeline_count, entries_per_timeline))
|
||||
wait_until_pageserver_is_caught_up(env, last_flush_lsns)
|
||||
|
||||
total_bytes_ingested = 0
|
||||
for tenant, timeline, last_flush_lsn in last_flush_lsns:
|
||||
http_client = env.pageserver.http_client()
|
||||
initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"])
|
||||
total_bytes_ingested += last_flush_lsn - initdb_lsn
|
||||
|
||||
log.info(f"Ingested {total_bytes_ingested} bytes since initdb (vs max dirty {max_dirty_data})")
|
||||
assert total_bytes_ingested > max_dirty_data
|
||||
|
||||
# Expected end state: the total physical size of all the tenants is in excess of the max dirty
|
||||
# data, but the total amount of dirty data is less than the limit: this demonstrates that we
|
||||
# have exceeded the threshold but then rolled layers in response
|
||||
def get_total_historic_layers():
|
||||
total_ephemeral_layers = 0
|
||||
total_historic_bytes = 0
|
||||
for tenant, timeline, _last_flush_lsn in last_flush_lsns:
|
||||
http_client = env.pageserver.http_client()
|
||||
initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"])
|
||||
layer_map = http_client.layer_map_info(tenant, timeline)
|
||||
total_historic_bytes += sum(
|
||||
layer.layer_file_size
|
||||
for layer in layer_map.historic_layers
|
||||
if layer.layer_file_size is not None and Lsn(layer.lsn_start) > initdb_lsn
|
||||
)
|
||||
total_ephemeral_layers += len(layer_map.in_memory_layers)
|
||||
|
||||
log.info(
|
||||
f"Total historic layer bytes: {total_historic_bytes} ({total_ephemeral_layers} ephemeral layers)"
|
||||
)
|
||||
|
||||
return total_historic_bytes
|
||||
|
||||
def assert_bytes_rolled():
|
||||
assert total_bytes_ingested - get_total_historic_layers() <= max_dirty_data
|
||||
|
||||
# Wait until enough layers have rolled that the amount of dirty data is under the threshold.
|
||||
# We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing
|
||||
# if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit.
|
||||
wait_until(compaction_period_s * 2, 1, assert_bytes_rolled)
|
||||
|
||||
# The end state should also have the reported metric under the limit
|
||||
def assert_dirty_data_limited():
|
||||
dirty_bytes = get_dirty_bytes(env)
|
||||
assert dirty_bytes < max_dirty_data
|
||||
|
||||
wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) # type: ignore
|
||||
@@ -1,6 +1,4 @@
|
||||
import gzip
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
@@ -12,11 +10,7 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.remote_storage import (
|
||||
LocalFsStorage,
|
||||
RemoteStorageKind,
|
||||
remote_storage_to_toml_inline_table,
|
||||
)
|
||||
from fixtures.remote_storage import RemoteStorageKind
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
from pytest_httpserver import HTTPServer
|
||||
from werkzeug.wrappers.request import Request
|
||||
@@ -46,9 +40,6 @@ def test_metric_collection(
|
||||
uploads.put((events, is_last == "true"))
|
||||
return Response(status=200)
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
assert neon_env_builder.pageserver_remote_storage is not None
|
||||
|
||||
# Require collecting metrics frequently, since we change
|
||||
# the timeline and want something to be logged about it.
|
||||
#
|
||||
@@ -57,11 +48,12 @@ def test_metric_collection(
|
||||
neon_env_builder.pageserver_config_override = f"""
|
||||
metric_collection_interval="1s"
|
||||
metric_collection_endpoint="{metric_collection_endpoint}"
|
||||
metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)}
|
||||
cached_metric_collection_interval="0s"
|
||||
synthetic_size_calculation_interval="3s"
|
||||
"""
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
|
||||
|
||||
# mock http server that returns OK for the metrics
|
||||
@@ -78,7 +70,6 @@ def test_metric_collection(
|
||||
# we have a fast rate of calculation, these can happen at shutdown
|
||||
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*",
|
||||
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes",
|
||||
".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -175,20 +166,6 @@ def test_metric_collection(
|
||||
|
||||
httpserver.check()
|
||||
|
||||
# Check that at least one bucket output object is present, and that all
|
||||
# can be decompressed and decoded.
|
||||
bucket_dumps = {}
|
||||
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
||||
for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root):
|
||||
for file in files:
|
||||
file_path = os.path.join(dirpath, file)
|
||||
log.info(file_path)
|
||||
if file.endswith(".gz"):
|
||||
bucket_dumps[file_path] = json.load(gzip.open(file_path))
|
||||
|
||||
assert len(bucket_dumps) >= 1
|
||||
assert all("events" in data for data in bucket_dumps.values())
|
||||
|
||||
|
||||
def test_metric_collection_cleans_up_tempfile(
|
||||
httpserver: HTTPServer,
|
||||
|
||||
@@ -11,7 +11,6 @@ from fixtures.pageserver.utils import (
|
||||
assert_prefix_empty,
|
||||
poll_for_remote_storage_iterations,
|
||||
tenant_delete_wait_completed,
|
||||
wait_for_upload_queue_empty,
|
||||
)
|
||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
@@ -90,8 +89,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
|
||||
# this shutdown case is logged at WARN severity by the time it bubbles up to logical size calculation code
|
||||
# WARN ...: initial size calculation failed: downloading failed, possibly for shutdown
|
||||
".*downloading failed, possibly for shutdown",
|
||||
# {tenant_id=... timeline_id=...}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1664/0/1260 blkno=0 req_lsn=0/149F0D8}: error reading relation or page version: Not found: will not become active. Current state: Stopping\n'
|
||||
".*page_service.*will not become active.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -475,10 +472,6 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
log.info("Synchronizing after initial write...")
|
||||
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
|
||||
|
||||
# Ensure that everything which appears in the heatmap is also present in S3: heatmap writers
|
||||
# are allowed to upload heatmaps that reference layers which are only enqueued for upload
|
||||
wait_for_upload_queue_empty(ps_attached.http_client(), tenant_id, timeline_id)
|
||||
|
||||
ps_secondary.http_client().tenant_secondary_download(tenant_id)
|
||||
|
||||
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
|
||||
@@ -491,11 +484,6 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
workload.churn_rows(128, ps_attached.id)
|
||||
|
||||
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
|
||||
|
||||
# Ensure that everything which appears in the heatmap is also present in S3: heatmap writers
|
||||
# are allowed to upload heatmaps that reference layers which are only enqueued for upload
|
||||
wait_for_upload_queue_empty(ps_attached.http_client(), tenant_id, timeline_id)
|
||||
|
||||
ps_secondary.http_client().tenant_secondary_download(tenant_id)
|
||||
|
||||
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user