mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-04 00:40:38 +00:00
Compare commits
58 Commits
release-pr
...
test_repli
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
08cf2749ca | ||
|
|
c85fd74d34 | ||
|
|
2d3c9f0d43 | ||
|
|
21b3e1d13b | ||
|
|
0788760451 | ||
|
|
74b2314a5d | ||
|
|
edcaae6290 | ||
|
|
4fc95d2d71 | ||
|
|
534c099b42 | ||
|
|
ec01292b55 | ||
|
|
66fc465484 | ||
|
|
55da8eff4f | ||
|
|
0fa517eb80 | ||
|
|
8ceb4f0a69 | ||
|
|
6019ccef06 | ||
|
|
0c6367a732 | ||
|
|
e17bc6afb4 | ||
|
|
ac7fc6110b | ||
|
|
862a6b7018 | ||
|
|
4810c22607 | ||
|
|
9d754e984f | ||
|
|
375e15815c | ||
|
|
7ce613354e | ||
|
|
ae15acdee7 | ||
|
|
c5f64fe54f | ||
|
|
40852b955d | ||
|
|
b30b15e7cb | ||
|
|
36b875388f | ||
|
|
3f77f26aa2 | ||
|
|
8b10407be4 | ||
|
|
944313ffe1 | ||
|
|
d443d07518 | ||
|
|
3de416a016 | ||
|
|
bc05d7eb9c | ||
|
|
d8da51e78a | ||
|
|
6e3834d506 | ||
|
|
582cec53c5 | ||
|
|
9957c6a9a0 | ||
|
|
a5777bab09 | ||
|
|
90a8ff55fa | ||
|
|
3b95e8072a | ||
|
|
8ee54ffd30 | ||
|
|
3ab9f56f5f | ||
|
|
7ddc7b4990 | ||
|
|
63213fc814 | ||
|
|
090123a429 | ||
|
|
39d1818ae9 | ||
|
|
90be79fcf5 | ||
|
|
c52b80b930 | ||
|
|
722f271f6e | ||
|
|
be1d8fc4f7 | ||
|
|
25c4b676e0 | ||
|
|
6633332e67 | ||
|
|
5928f6709c | ||
|
|
63b2060aef | ||
|
|
24c5a5ac16 | ||
|
|
7f9cc1bd5e | ||
|
|
cdf12ed008 |
@@ -22,6 +22,7 @@
|
|||||||
!s3_scrubber/
|
!s3_scrubber/
|
||||||
!safekeeper/
|
!safekeeper/
|
||||||
!storage_broker/
|
!storage_broker/
|
||||||
|
!storage_controller/
|
||||||
!trace/
|
!trace/
|
||||||
!vendor/postgres-*/
|
!vendor/postgres-*/
|
||||||
!workspace_hack/
|
!workspace_hack/
|
||||||
|
|||||||
58
.github/workflows/benchmarking.yml
vendored
58
.github/workflows/benchmarking.yml
vendored
@@ -147,15 +147,16 @@ jobs:
|
|||||||
"neonvm-captest-new"
|
"neonvm-captest-new"
|
||||||
],
|
],
|
||||||
"db_size": [ "10gb" ],
|
"db_size": [ "10gb" ],
|
||||||
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||||
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
||||||
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||||
{ "platform": "neonvm-captest-new", "db_size": "50gb" }]
|
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
|
||||||
|
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
||||||
}'
|
}'
|
||||||
|
|
||||||
if [ "$(date +%A)" = "Saturday" ]; then
|
if [ "$(date +%A)" = "Saturday" ]; then
|
||||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
|
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
|
||||||
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||||
@@ -171,7 +172,7 @@ jobs:
|
|||||||
|
|
||||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
|
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
|
||||||
{ "platform": "rds-aurora" }]')
|
{ "platform": "rds-aurora" }]')
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||||
@@ -190,7 +191,7 @@ jobs:
|
|||||||
|
|
||||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
||||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||||
@@ -253,6 +254,9 @@ jobs:
|
|||||||
neon-captest-reuse)
|
neon-captest-reuse)
|
||||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||||
;;
|
;;
|
||||||
|
neonvm-captest-sharding-reuse)
|
||||||
|
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
|
||||||
|
;;
|
||||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
|
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
|
||||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||||
;;
|
;;
|
||||||
@@ -270,11 +274,15 @@ jobs:
|
|||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
QUERY="SELECT version();"
|
QUERIES=("SELECT version()")
|
||||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
QUERIES+=("SHOW neon.tenant_id")
|
||||||
|
QUERIES+=("SHOW neon.timeline_id")
|
||||||
fi
|
fi
|
||||||
psql ${CONNSTR} -c "${QUERY}"
|
|
||||||
|
for q in "${QUERIES[@]}"; do
|
||||||
|
psql ${CONNSTR} -c "${q}"
|
||||||
|
done
|
||||||
|
|
||||||
- name: Benchmark init
|
- name: Benchmark init
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
@@ -401,11 +409,15 @@ jobs:
|
|||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
QUERY="SELECT version();"
|
QUERIES=("SELECT version()")
|
||||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
QUERIES+=("SHOW neon.tenant_id")
|
||||||
|
QUERIES+=("SHOW neon.timeline_id")
|
||||||
fi
|
fi
|
||||||
psql ${CONNSTR} -c "${QUERY}"
|
|
||||||
|
for q in "${QUERIES[@]}"; do
|
||||||
|
psql ${CONNSTR} -c "${q}"
|
||||||
|
done
|
||||||
|
|
||||||
- name: ClickBench benchmark
|
- name: ClickBench benchmark
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
@@ -507,11 +519,15 @@ jobs:
|
|||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
QUERY="SELECT version();"
|
QUERIES=("SELECT version()")
|
||||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
QUERIES+=("SHOW neon.tenant_id")
|
||||||
|
QUERIES+=("SHOW neon.timeline_id")
|
||||||
fi
|
fi
|
||||||
psql ${CONNSTR} -c "${QUERY}"
|
|
||||||
|
for q in "${QUERIES[@]}"; do
|
||||||
|
psql ${CONNSTR} -c "${q}"
|
||||||
|
done
|
||||||
|
|
||||||
- name: Run TPC-H benchmark
|
- name: Run TPC-H benchmark
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
@@ -597,11 +613,15 @@ jobs:
|
|||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
QUERY="SELECT version();"
|
QUERIES=("SELECT version()")
|
||||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
QUERIES+=("SHOW neon.tenant_id")
|
||||||
|
QUERIES+=("SHOW neon.timeline_id")
|
||||||
fi
|
fi
|
||||||
psql ${CONNSTR} -c "${QUERY}"
|
|
||||||
|
for q in "${QUERIES[@]}"; do
|
||||||
|
psql ${CONNSTR} -c "${q}"
|
||||||
|
done
|
||||||
|
|
||||||
- name: Run user examples
|
- name: Run user examples
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
|
|||||||
3
.github/workflows/build_and_test.yml
vendored
3
.github/workflows/build_and_test.yml
vendored
@@ -1127,6 +1127,7 @@ jobs:
|
|||||||
-f deployProxy=false \
|
-f deployProxy=false \
|
||||||
-f deployStorage=true \
|
-f deployStorage=true \
|
||||||
-f deployStorageBroker=true \
|
-f deployStorageBroker=true \
|
||||||
|
-f deployStorageController=true \
|
||||||
-f branch=main \
|
-f branch=main \
|
||||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||||
-f deployPreprodRegion=true
|
-f deployPreprodRegion=true
|
||||||
@@ -1136,6 +1137,7 @@ jobs:
|
|||||||
-f deployProxy=false \
|
-f deployProxy=false \
|
||||||
-f deployStorage=true \
|
-f deployStorage=true \
|
||||||
-f deployStorageBroker=true \
|
-f deployStorageBroker=true \
|
||||||
|
-f deployStorageController=true \
|
||||||
-f branch=main \
|
-f branch=main \
|
||||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||||
@@ -1144,6 +1146,7 @@ jobs:
|
|||||||
-f deployProxy=true \
|
-f deployProxy=true \
|
||||||
-f deployStorage=false \
|
-f deployStorage=false \
|
||||||
-f deployStorageBroker=false \
|
-f deployStorageBroker=false \
|
||||||
|
-f deployStorageController=false \
|
||||||
-f branch=main \
|
-f branch=main \
|
||||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||||
-f deployPreprodRegion=true
|
-f deployPreprodRegion=true
|
||||||
|
|||||||
90
.github/workflows/trigger-e2e-tests.yml
vendored
90
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -62,14 +62,14 @@ jobs:
|
|||||||
|
|
||||||
trigger-e2e-tests:
|
trigger-e2e-tests:
|
||||||
needs: [ tag ]
|
needs: [ tag ]
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: ubuntu-latest
|
||||||
env:
|
env:
|
||||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||||
container:
|
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
|
||||||
options: --init
|
|
||||||
steps:
|
steps:
|
||||||
- name: check if ecr image are present
|
- name: check if ecr image are present
|
||||||
|
env:
|
||||||
|
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||||
|
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||||
run: |
|
run: |
|
||||||
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
|
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
|
||||||
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
|
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
|
||||||
@@ -79,41 +79,55 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
- name: Set PR's status to pending and request a remote CI test
|
- name: Set e2e-platforms
|
||||||
|
id: e2e-platforms
|
||||||
|
env:
|
||||||
|
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||||
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
|
# Default set of platforms to run e2e tests on
|
||||||
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
|
platforms='["docker", "k8s"]'
|
||||||
# to place a job run status update later.
|
|
||||||
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
|
||||||
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
|
|
||||||
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
|
||||||
|
|
||||||
REMOTE_REPO="${{ github.repository_owner }}/cloud"
|
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
|
||||||
|
# If the workflow run is not a pull request, add k8s-neonvm to the list.
|
||||||
|
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
|
||||||
|
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
|
||||||
|
case "$f" in
|
||||||
|
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
|
||||||
|
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
# no-op
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
else
|
||||||
|
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
||||||
|
fi
|
||||||
|
|
||||||
curl -f -X POST \
|
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
|
||||||
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
|
||||||
-H "Accept: application/vnd.github.v3+json" \
|
|
||||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
|
||||||
--data \
|
|
||||||
"{
|
|
||||||
\"state\": \"pending\",
|
|
||||||
\"context\": \"neon-cloud-e2e\",
|
|
||||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
|
||||||
}"
|
|
||||||
|
|
||||||
curl -f -X POST \
|
- name: Set PR's status to pending and request a remote CI test
|
||||||
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
env:
|
||||||
-H "Accept: application/vnd.github.v3+json" \
|
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
|
||||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
--data \
|
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||||
"{
|
run: |
|
||||||
\"ref\": \"main\",
|
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
|
||||||
\"inputs\": {
|
|
||||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
|
||||||
\"commit_hash\": \"$COMMIT_SHA\",
|
--method POST \
|
||||||
\"remote_repo\": \"${{ github.repository }}\",
|
--raw-field "state=pending" \
|
||||||
\"storage_image_tag\": \"${TAG}\",
|
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
|
||||||
\"compute_image_tag\": \"${TAG}\",
|
--raw-field "context=neon-cloud-e2e"
|
||||||
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
|
||||||
}
|
gh workflow --repo ${REMOTE_REPO} \
|
||||||
}"
|
run testing.yml \
|
||||||
|
--ref "main" \
|
||||||
|
--raw-field "ci_job_name=neon-cloud-e2e" \
|
||||||
|
--raw-field "commit_hash=$COMMIT_SHA" \
|
||||||
|
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
|
||||||
|
--raw-field "storage_image_tag=${TAG}" \
|
||||||
|
--raw-field "compute_image_tag=${TAG}" \
|
||||||
|
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
|
||||||
|
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||||
/control_plane/attachment_service @neondatabase/storage
|
/storage_controller @neondatabase/storage
|
||||||
/libs/pageserver_api/ @neondatabase/storage
|
/libs/pageserver_api/ @neondatabase/storage
|
||||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
|
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
|
||||||
/libs/remote_storage/ @neondatabase/storage
|
/libs/remote_storage/ @neondatabase/storage
|
||||||
|
|||||||
122
Cargo.lock
generated
122
Cargo.lock
generated
@@ -270,44 +270,6 @@ dependencies = [
|
|||||||
"critical-section",
|
"critical-section",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "attachment_service"
|
|
||||||
version = "0.1.0"
|
|
||||||
dependencies = [
|
|
||||||
"anyhow",
|
|
||||||
"aws-config",
|
|
||||||
"bytes",
|
|
||||||
"camino",
|
|
||||||
"clap",
|
|
||||||
"control_plane",
|
|
||||||
"diesel",
|
|
||||||
"diesel_migrations",
|
|
||||||
"fail",
|
|
||||||
"futures",
|
|
||||||
"git-version",
|
|
||||||
"hex",
|
|
||||||
"humantime",
|
|
||||||
"hyper",
|
|
||||||
"lasso",
|
|
||||||
"measured",
|
|
||||||
"metrics",
|
|
||||||
"once_cell",
|
|
||||||
"pageserver_api",
|
|
||||||
"pageserver_client",
|
|
||||||
"postgres_connection",
|
|
||||||
"r2d2",
|
|
||||||
"reqwest",
|
|
||||||
"routerify",
|
|
||||||
"serde",
|
|
||||||
"serde_json",
|
|
||||||
"thiserror",
|
|
||||||
"tokio",
|
|
||||||
"tokio-util",
|
|
||||||
"tracing",
|
|
||||||
"utils",
|
|
||||||
"workspace_hack",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "autocfg"
|
name = "autocfg"
|
||||||
version = "1.1.0"
|
version = "1.1.0"
|
||||||
@@ -2234,9 +2196,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "h2"
|
name = "h2"
|
||||||
version = "0.3.24"
|
version = "0.3.26"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
|
checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"fnv",
|
"fnv",
|
||||||
@@ -3435,9 +3397,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ordered-multimap"
|
name = "ordered-multimap"
|
||||||
version = "0.7.1"
|
version = "0.7.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
|
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dlv-list",
|
"dlv-list",
|
||||||
"hashbrown 0.14.0",
|
"hashbrown 0.14.0",
|
||||||
@@ -4199,6 +4161,7 @@ name = "proxy"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"async-compression",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"aws-config",
|
"aws-config",
|
||||||
"aws-sdk-iam",
|
"aws-sdk-iam",
|
||||||
@@ -5621,6 +5584,65 @@ dependencies = [
|
|||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "storage_controller"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"aws-config",
|
||||||
|
"bytes",
|
||||||
|
"camino",
|
||||||
|
"clap",
|
||||||
|
"control_plane",
|
||||||
|
"diesel",
|
||||||
|
"diesel_migrations",
|
||||||
|
"fail",
|
||||||
|
"futures",
|
||||||
|
"git-version",
|
||||||
|
"hex",
|
||||||
|
"humantime",
|
||||||
|
"hyper",
|
||||||
|
"itertools",
|
||||||
|
"lasso",
|
||||||
|
"measured",
|
||||||
|
"metrics",
|
||||||
|
"once_cell",
|
||||||
|
"pageserver_api",
|
||||||
|
"pageserver_client",
|
||||||
|
"postgres_connection",
|
||||||
|
"r2d2",
|
||||||
|
"reqwest",
|
||||||
|
"routerify",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"thiserror",
|
||||||
|
"tokio",
|
||||||
|
"tokio-util",
|
||||||
|
"tracing",
|
||||||
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "storcon_cli"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"clap",
|
||||||
|
"comfy-table",
|
||||||
|
"hyper",
|
||||||
|
"pageserver_api",
|
||||||
|
"pageserver_client",
|
||||||
|
"reqwest",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"thiserror",
|
||||||
|
"tokio",
|
||||||
|
"tracing",
|
||||||
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "stringprep"
|
name = "stringprep"
|
||||||
version = "0.1.2"
|
version = "0.1.2"
|
||||||
@@ -5777,23 +5799,23 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "test-context"
|
name = "test-context"
|
||||||
version = "0.1.4"
|
version = "0.3.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
|
checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
|
||||||
"futures",
|
"futures",
|
||||||
"test-context-macros",
|
"test-context-macros",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "test-context-macros"
|
name = "test-context-macros"
|
||||||
version = "0.1.4"
|
version = "0.3.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
|
checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 1.0.109",
|
"syn 2.0.52",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -5934,9 +5956,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio"
|
name = "tokio"
|
||||||
version = "1.36.0"
|
version = "1.37.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
|
checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"backtrace",
|
"backtrace",
|
||||||
"bytes",
|
"bytes",
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ resolver = "2"
|
|||||||
members = [
|
members = [
|
||||||
"compute_tools",
|
"compute_tools",
|
||||||
"control_plane",
|
"control_plane",
|
||||||
"control_plane/attachment_service",
|
"control_plane/storcon_cli",
|
||||||
"pageserver",
|
"pageserver",
|
||||||
"pageserver/compaction",
|
"pageserver/compaction",
|
||||||
"pageserver/ctl",
|
"pageserver/ctl",
|
||||||
@@ -12,6 +12,7 @@ members = [
|
|||||||
"proxy",
|
"proxy",
|
||||||
"safekeeper",
|
"safekeeper",
|
||||||
"storage_broker",
|
"storage_broker",
|
||||||
|
"storage_controller",
|
||||||
"s3_scrubber",
|
"s3_scrubber",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
"trace",
|
"trace",
|
||||||
@@ -158,7 +159,7 @@ svg_fmt = "0.4.1"
|
|||||||
sync_wrapper = "0.1.2"
|
sync_wrapper = "0.1.2"
|
||||||
tar = "0.4"
|
tar = "0.4"
|
||||||
task-local-extensions = "0.1.4"
|
task-local-extensions = "0.1.4"
|
||||||
test-context = "0.1"
|
test-context = "0.3"
|
||||||
thiserror = "1.0"
|
thiserror = "1.0"
|
||||||
tikv-jemallocator = "0.5"
|
tikv-jemallocator = "0.5"
|
||||||
tikv-jemalloc-ctl = "0.5"
|
tikv-jemalloc-ctl = "0.5"
|
||||||
|
|||||||
@@ -944,6 +944,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
|||||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||||
|
|
||||||
|
# Create remote extension download directory
|
||||||
|
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
|
||||||
|
|
||||||
# Install:
|
# Install:
|
||||||
# libreadline8 for psql
|
# libreadline8 for psql
|
||||||
# libicu67, locales for collations (including ICU and plpgsql_check)
|
# libicu67, locales for collations (including ICU and plpgsql_check)
|
||||||
|
|||||||
@@ -1262,10 +1262,12 @@ LIMIT 100",
|
|||||||
.await
|
.await
|
||||||
.map_err(DownloadError::Other);
|
.map_err(DownloadError::Other);
|
||||||
|
|
||||||
self.ext_download_progress
|
if download_size.is_ok() {
|
||||||
.write()
|
self.ext_download_progress
|
||||||
.expect("bad lock")
|
.write()
|
||||||
.insert(ext_archive_name.to_string(), (download_start, true));
|
.expect("bad lock")
|
||||||
|
.insert(ext_archive_name.to_string(), (download_start, true));
|
||||||
|
}
|
||||||
|
|
||||||
download_size
|
download_size
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
RoleAction::Create => {
|
RoleAction::Create => {
|
||||||
// This branch only runs when roles are created through the console, so it is
|
// This branch only runs when roles are created through the console, so it is
|
||||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||||
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
|
// from neon_superuser.
|
||||||
let mut query: String = format!(
|
let mut query: String = format!(
|
||||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||||
name.pg_quote()
|
name.pg_quote()
|
||||||
);
|
);
|
||||||
info!("running role create query: '{}'", &query);
|
info!("running role create query: '{}'", &query);
|
||||||
@@ -743,21 +743,24 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
|
|||||||
// which may happen in two cases:
|
// which may happen in two cases:
|
||||||
// - extension was just installed
|
// - extension was just installed
|
||||||
// - extension was already installed and is up to date
|
// - extension was already installed and is up to date
|
||||||
// DISABLED due to compute node unpinning epic
|
let query = "ALTER EXTENSION neon UPDATE";
|
||||||
// let query = "ALTER EXTENSION neon UPDATE";
|
info!("update neon extension version with query: {}", query);
|
||||||
// info!("update neon extension version with query: {}", query);
|
if let Err(e) = client.simple_query(query) {
|
||||||
// client.simple_query(query)?;
|
error!(
|
||||||
|
"failed to upgrade neon extension during `handle_extension_neon`: {}",
|
||||||
|
e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
|
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
|
||||||
info!("handle neon extension upgrade (not really)");
|
info!("handle neon extension upgrade");
|
||||||
// DISABLED due to compute node unpinning epic
|
let query = "ALTER EXTENSION neon UPDATE";
|
||||||
// let query = "ALTER EXTENSION neon UPDATE";
|
info!("update neon extension version with query: {}", query);
|
||||||
// info!("update neon extension version with query: {}", query);
|
client.simple_query(query)?;
|
||||||
// client.simple_query(query)?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -806,19 +809,8 @@ $$;"#,
|
|||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
|
"",
|
||||||
// Add new migrations below.
|
// Add new migrations below.
|
||||||
r#"
|
|
||||||
DO $$
|
|
||||||
DECLARE
|
|
||||||
role_name TEXT;
|
|
||||||
BEGIN
|
|
||||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
|
||||||
LOOP
|
|
||||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
|
||||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
|
||||||
END LOOP;
|
|
||||||
END
|
|
||||||
$$;"#,
|
|
||||||
];
|
];
|
||||||
|
|
||||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||||
|
|||||||
@@ -14,9 +14,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
|||||||
use control_plane::safekeeper::SafekeeperNode;
|
use control_plane::safekeeper::SafekeeperNode;
|
||||||
use control_plane::storage_controller::StorageController;
|
use control_plane::storage_controller::StorageController;
|
||||||
use control_plane::{broker, local_env};
|
use control_plane::{broker, local_env};
|
||||||
use pageserver_api::controller_api::{
|
use pageserver_api::controller_api::PlacementPolicy;
|
||||||
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
|
|
||||||
};
|
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||||
};
|
};
|
||||||
@@ -1060,21 +1058,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Some(("set-state", subcommand_args)) => {
|
|
||||||
let pageserver = get_pageserver(env, subcommand_args)?;
|
|
||||||
let scheduling = subcommand_args.get_one("scheduling");
|
|
||||||
let availability = subcommand_args.get_one("availability");
|
|
||||||
|
|
||||||
let storage_controller = StorageController::from_env(env);
|
|
||||||
storage_controller
|
|
||||||
.node_configure(NodeConfigureRequest {
|
|
||||||
node_id: pageserver.conf.id,
|
|
||||||
scheduling: scheduling.cloned(),
|
|
||||||
availability: availability.cloned(),
|
|
||||||
})
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(("status", subcommand_args)) => {
|
Some(("status", subcommand_args)) => {
|
||||||
match get_pageserver(env, subcommand_args)?.check_status().await {
|
match get_pageserver(env, subcommand_args)?.check_status().await {
|
||||||
Ok(_) => println!("Page server is up and running"),
|
Ok(_) => println!("Page server is up and running"),
|
||||||
@@ -1515,12 +1498,6 @@ fn cli() -> Command {
|
|||||||
.about("Restart local pageserver")
|
.about("Restart local pageserver")
|
||||||
.arg(pageserver_config_args.clone())
|
.arg(pageserver_config_args.clone())
|
||||||
)
|
)
|
||||||
.subcommand(Command::new("set-state")
|
|
||||||
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
|
|
||||||
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
|
|
||||||
.about("Set scheduling or availability state of pageserver node")
|
|
||||||
.arg(pageserver_config_args.clone())
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("storage_controller")
|
Command::new("storage_controller")
|
||||||
|
|||||||
@@ -389,6 +389,10 @@ impl PageServerNode {
|
|||||||
.remove("image_creation_threshold")
|
.remove("image_creation_threshold")
|
||||||
.map(|x| x.parse::<usize>())
|
.map(|x| x.parse::<usize>())
|
||||||
.transpose()?,
|
.transpose()?,
|
||||||
|
image_layer_creation_check_threshold: settings
|
||||||
|
.remove("image_layer_creation_check_threshold")
|
||||||
|
.map(|x| x.parse::<u8>())
|
||||||
|
.transpose()?,
|
||||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||||
walreceiver_connect_timeout: settings
|
walreceiver_connect_timeout: settings
|
||||||
.remove("walreceiver_connect_timeout")
|
.remove("walreceiver_connect_timeout")
|
||||||
@@ -501,6 +505,12 @@ impl PageServerNode {
|
|||||||
.map(|x| x.parse::<usize>())
|
.map(|x| x.parse::<usize>())
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
||||||
|
image_layer_creation_check_threshold: settings
|
||||||
|
.remove("image_layer_creation_check_threshold")
|
||||||
|
.map(|x| x.parse::<u8>())
|
||||||
|
.transpose()
|
||||||
|
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
|
||||||
|
|
||||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||||
walreceiver_connect_timeout: settings
|
walreceiver_connect_timeout: settings
|
||||||
.remove("walreceiver_connect_timeout")
|
.remove("walreceiver_connect_timeout")
|
||||||
|
|||||||
23
control_plane/storcon_cli/Cargo.toml
Normal file
23
control_plane/storcon_cli/Cargo.toml
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
[package]
|
||||||
|
name = "storcon_cli"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition.workspace = true
|
||||||
|
license.workspace = true
|
||||||
|
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
anyhow.workspace = true
|
||||||
|
clap.workspace = true
|
||||||
|
comfy-table.workspace = true
|
||||||
|
hyper.workspace = true
|
||||||
|
pageserver_api.workspace = true
|
||||||
|
pageserver_client.workspace = true
|
||||||
|
reqwest.workspace = true
|
||||||
|
serde.workspace = true
|
||||||
|
serde_json = { workspace = true, features = ["raw_value"] }
|
||||||
|
thiserror.workspace = true
|
||||||
|
tokio.workspace = true
|
||||||
|
tracing.workspace = true
|
||||||
|
utils.workspace = true
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
587
control_plane/storcon_cli/src/main.rs
Normal file
587
control_plane/storcon_cli/src/main.rs
Normal file
@@ -0,0 +1,587 @@
|
|||||||
|
use std::{collections::HashMap, str::FromStr};
|
||||||
|
|
||||||
|
use clap::{Parser, Subcommand};
|
||||||
|
use hyper::Method;
|
||||||
|
use pageserver_api::{
|
||||||
|
controller_api::{
|
||||||
|
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
||||||
|
TenantDescribeResponse, TenantPolicyRequest,
|
||||||
|
},
|
||||||
|
models::{
|
||||||
|
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
|
||||||
|
TenantShardSplitRequest, TenantShardSplitResponse,
|
||||||
|
},
|
||||||
|
shard::{ShardStripeSize, TenantShardId},
|
||||||
|
};
|
||||||
|
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
||||||
|
use reqwest::Url;
|
||||||
|
use serde::{de::DeserializeOwned, Serialize};
|
||||||
|
use utils::id::{NodeId, TenantId};
|
||||||
|
|
||||||
|
use pageserver_api::controller_api::{
|
||||||
|
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||||
|
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Subcommands of the storage controller CLI. Each variant is handled by one arm of the
// `match cli.command` in `main`.
//
// NOTE: `///` comments on the variants and fields are picked up by clap as help text, so
// reviewer notes in this block deliberately use plain `//` comments.
#[derive(Subcommand, Debug)]
enum Command {
    /// Register a pageserver with the storage controller. This shouldn't usually be necessary,
    /// since pageservers auto-register when they start up
    NodeRegister {
        #[arg(long)]
        node_id: NodeId,

        // Address/port pairs are forwarded verbatim in the `NodeRegisterRequest` body.
        #[arg(long)]
        listen_pg_addr: String,
        #[arg(long)]
        listen_pg_port: u16,

        #[arg(long)]
        listen_http_addr: String,
        #[arg(long)]
        listen_http_port: u16,
    },

    /// Modify a node's configuration in the storage controller
    NodeConfigure {
        #[arg(long)]
        node_id: NodeId,

        /// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
        /// manually mark a node offline
        #[arg(long)]
        availability: Option<NodeAvailabilityArg>,
        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
        #[arg(long)]
        scheduling: Option<NodeSchedulingPolicy>,
    },
    /// Modify a tenant's policies in the storage controller
    TenantPolicy {
        #[arg(long)]
        tenant_id: TenantId,
        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
        /// or is in the normal attached state with N secondary locations (`attached:N`)
        #[arg(long)]
        placement: Option<PlacementPolicyArg>,
        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
        /// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
        /// unavailable, and are only for use in emergencies.
        #[arg(long)]
        scheduling: Option<ShardSchedulingPolicyArg>,
    },
    /// List nodes known to the storage controller
    Nodes {},
    /// List tenants known to the storage controller
    Tenants {},
    /// Create a new tenant in the storage controller, and by extension on pageservers.
    TenantCreate {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Delete a tenant in the storage controller, and by extension on pageservers.
    TenantDelete {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Split an existing tenant into a higher number of shards than its current shard count.
    TenantShardSplit {
        #[arg(long)]
        tenant_id: TenantId,
        // Sent as `new_shard_count` in the split request.
        #[arg(long)]
        shard_count: u8,
        /// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
        #[arg(long)]
        stripe_size: Option<u32>,
    },
    /// Migrate the attached location for a tenant shard to a specific pageserver.
    TenantShardMigrate {
        #[arg(long)]
        tenant_shard_id: TenantShardId,
        // Destination node; sent as `node_id` in the migrate request.
        #[arg(long)]
        node: NodeId,
    },
    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
    TenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        // JSON text; `main` parses it with `serde_json::from_str` before sending it.
        #[arg(long)]
        config: String,
    },
    /// Attempt to balance the locations for a tenant across pageservers. This is a client-side
    /// alternative to the storage controller's scheduling optimization behavior.
    TenantScatter {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Print details about a particular tenant, including all its shards' states.
    TenantDescribe {
        #[arg(long)]
        tenant_id: TenantId,
    },
}
|
||||||
|
|
||||||
|
// Top-level command line definition, parsed in `main` via `Cli::parse()`.
//
// NOTE: `///` comments here become clap help text, so reviewer notes use plain `//`.
#[derive(Parser)]
#[command(
    author,
    version,
    about,
    long_about = "CLI for Storage Controller Support/Debug"
)]
#[command(arg_required_else_help(true))]
struct Cli {
    // `main` builds both HTTP clients (the local `Client` and `mgmt_api::Client`) from this URL.
    #[arg(long)]
    /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
    api: Url,

    // When present, `Client::dispatch` sends it as an `Authorization: Bearer <token>` header.
    #[arg(long)]
    /// JWT token for authenticating with storage controller. Depending on the API used, this
    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
    /// a token with both scopes to use with this tool.
    jwt: Option<String>,

    #[command(subcommand)]
    command: Command,
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct PlacementPolicyArg(PlacementPolicy);
|
||||||
|
|
||||||
|
impl FromStr for PlacementPolicyArg {
|
||||||
|
type Err = anyhow::Error;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
|
match s {
|
||||||
|
"detached" => Ok(Self(PlacementPolicy::Detached)),
|
||||||
|
"secondary" => Ok(Self(PlacementPolicy::Secondary)),
|
||||||
|
_ if s.starts_with("attached:") => {
|
||||||
|
let mut splitter = s.split(':');
|
||||||
|
let _prefix = splitter.next().unwrap();
|
||||||
|
match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
|
||||||
|
Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
|
||||||
|
None => Err(anyhow::anyhow!(
|
||||||
|
"Invalid format '{s}', a valid example is 'attached:1'"
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => Err(anyhow::anyhow!(
|
||||||
|
"Unknown placement policy '{s}', try detached,secondary,attached:<n>"
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
|
||||||
|
|
||||||
|
impl FromStr for ShardSchedulingPolicyArg {
|
||||||
|
type Err = anyhow::Error;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
|
match s {
|
||||||
|
"active" => Ok(Self(ShardSchedulingPolicy::Active)),
|
||||||
|
"essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
|
||||||
|
"pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
|
||||||
|
"stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
|
||||||
|
_ => Err(anyhow::anyhow!(
|
||||||
|
"Unknown scheduling policy '{s}', try active,essential,pause,stop"
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct NodeAvailabilityArg(NodeAvailabilityWrapper);
|
||||||
|
|
||||||
|
impl FromStr for NodeAvailabilityArg {
|
||||||
|
type Err = anyhow::Error;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
|
match s {
|
||||||
|
"active" => Ok(Self(NodeAvailabilityWrapper::Active)),
|
||||||
|
"offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
|
||||||
|
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Client {
|
||||||
|
base_url: Url,
|
||||||
|
jwt_token: Option<String>,
|
||||||
|
client: reqwest::Client,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Client {
|
||||||
|
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
|
||||||
|
Self {
|
||||||
|
base_url,
|
||||||
|
jwt_token,
|
||||||
|
client: reqwest::ClientBuilder::new()
|
||||||
|
.build()
|
||||||
|
.expect("Failed to construct http client"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Simple HTTP request wrapper for calling into storage controller
|
||||||
|
async fn dispatch<RQ, RS>(
|
||||||
|
&self,
|
||||||
|
method: hyper::Method,
|
||||||
|
path: String,
|
||||||
|
body: Option<RQ>,
|
||||||
|
) -> mgmt_api::Result<RS>
|
||||||
|
where
|
||||||
|
RQ: Serialize + Sized,
|
||||||
|
RS: DeserializeOwned + Sized,
|
||||||
|
{
|
||||||
|
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||||
|
// for general purpose API access.
|
||||||
|
let url = Url::from_str(&format!(
|
||||||
|
"http://{}:{}/{path}",
|
||||||
|
self.base_url.host_str().unwrap(),
|
||||||
|
self.base_url.port().unwrap()
|
||||||
|
))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut builder = self.client.request(method, url);
|
||||||
|
if let Some(body) = body {
|
||||||
|
builder = builder.json(&body)
|
||||||
|
}
|
||||||
|
if let Some(jwt_token) = &self.jwt_token {
|
||||||
|
builder = builder.header(
|
||||||
|
reqwest::header::AUTHORIZATION,
|
||||||
|
format!("Bearer {jwt_token}"),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
|
||||||
|
let response = response.error_from_body().await?;
|
||||||
|
|
||||||
|
response
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> anyhow::Result<()> {
|
||||||
|
let cli = Cli::parse();
|
||||||
|
|
||||||
|
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
|
||||||
|
|
||||||
|
let mut trimmed = cli.api.to_string();
|
||||||
|
trimmed.pop();
|
||||||
|
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
|
||||||
|
|
||||||
|
match cli.command {
|
||||||
|
Command::NodeRegister {
|
||||||
|
node_id,
|
||||||
|
listen_pg_addr,
|
||||||
|
listen_pg_port,
|
||||||
|
listen_http_addr,
|
||||||
|
listen_http_port,
|
||||||
|
} => {
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<_, ()>(
|
||||||
|
Method::POST,
|
||||||
|
"control/v1/node".to_string(),
|
||||||
|
Some(NodeRegisterRequest {
|
||||||
|
node_id,
|
||||||
|
listen_pg_addr,
|
||||||
|
listen_pg_port,
|
||||||
|
listen_http_addr,
|
||||||
|
listen_http_port,
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
Command::TenantCreate { tenant_id } => {
|
||||||
|
vps_client
|
||||||
|
.tenant_create(&TenantCreateRequest {
|
||||||
|
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||||
|
generation: None,
|
||||||
|
shard_parameters: ShardParameters::default(),
|
||||||
|
placement_policy: Some(PlacementPolicy::Attached(1)),
|
||||||
|
config: TenantConfig::default(),
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
Command::TenantDelete { tenant_id } => {
|
||||||
|
let status = vps_client
|
||||||
|
.tenant_delete(TenantShardId::unsharded(tenant_id))
|
||||||
|
.await?;
|
||||||
|
tracing::info!("Delete status: {}", status);
|
||||||
|
}
|
||||||
|
Command::Nodes {} => {
|
||||||
|
let resp = storcon_client
|
||||||
|
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||||
|
Method::GET,
|
||||||
|
"control/v1/node".to_string(),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let mut table = comfy_table::Table::new();
|
||||||
|
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
||||||
|
for node in resp {
|
||||||
|
table.add_row([
|
||||||
|
format!("{}", node.id),
|
||||||
|
node.listen_http_addr,
|
||||||
|
format!("{:?}", node.scheduling),
|
||||||
|
format!("{:?}", node.availability),
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
println!("{table}");
|
||||||
|
}
|
||||||
|
Command::NodeConfigure {
|
||||||
|
node_id,
|
||||||
|
availability,
|
||||||
|
scheduling,
|
||||||
|
} => {
|
||||||
|
let req = NodeConfigureRequest {
|
||||||
|
node_id,
|
||||||
|
availability: availability.map(|a| a.0),
|
||||||
|
scheduling,
|
||||||
|
};
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<_, ()>(
|
||||||
|
Method::PUT,
|
||||||
|
format!("control/v1/node/{node_id}/config"),
|
||||||
|
Some(req),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
Command::Tenants {} => {
|
||||||
|
let resp = storcon_client
|
||||||
|
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||||
|
Method::GET,
|
||||||
|
"control/v1/tenant".to_string(),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let mut table = comfy_table::Table::new();
|
||||||
|
table.set_header([
|
||||||
|
"TenantId",
|
||||||
|
"ShardCount",
|
||||||
|
"StripeSize",
|
||||||
|
"Placement",
|
||||||
|
"Scheduling",
|
||||||
|
]);
|
||||||
|
for tenant in resp {
|
||||||
|
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
||||||
|
table.add_row([
|
||||||
|
format!("{}", tenant.tenant_id),
|
||||||
|
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
||||||
|
format!("{:?}", tenant.stripe_size),
|
||||||
|
format!("{:?}", tenant.policy),
|
||||||
|
format!("{:?}", shard_zero.scheduling_policy),
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("{table}");
|
||||||
|
}
|
||||||
|
Command::TenantPolicy {
|
||||||
|
tenant_id,
|
||||||
|
placement,
|
||||||
|
scheduling,
|
||||||
|
} => {
|
||||||
|
let req = TenantPolicyRequest {
|
||||||
|
scheduling: scheduling.map(|s| s.0),
|
||||||
|
placement: placement.map(|p| p.0),
|
||||||
|
};
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<_, ()>(
|
||||||
|
Method::PUT,
|
||||||
|
format!("control/v1/tenant/{tenant_id}/policy"),
|
||||||
|
Some(req),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
Command::TenantShardSplit {
|
||||||
|
tenant_id,
|
||||||
|
shard_count,
|
||||||
|
stripe_size,
|
||||||
|
} => {
|
||||||
|
let req = TenantShardSplitRequest {
|
||||||
|
new_shard_count: shard_count,
|
||||||
|
new_stripe_size: stripe_size.map(ShardStripeSize),
|
||||||
|
};
|
||||||
|
|
||||||
|
let response = storcon_client
|
||||||
|
.dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
|
||||||
|
Method::PUT,
|
||||||
|
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
||||||
|
Some(req),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
println!(
|
||||||
|
"Split tenant {} into {} shards: {}",
|
||||||
|
tenant_id,
|
||||||
|
shard_count,
|
||||||
|
response
|
||||||
|
.new_shards
|
||||||
|
.iter()
|
||||||
|
.map(|s| format!("{:?}", s))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(",")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Command::TenantShardMigrate {
|
||||||
|
tenant_shard_id,
|
||||||
|
node,
|
||||||
|
} => {
|
||||||
|
let req = TenantShardMigrateRequest {
|
||||||
|
tenant_shard_id,
|
||||||
|
node_id: node,
|
||||||
|
};
|
||||||
|
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||||
|
Method::PUT,
|
||||||
|
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||||
|
Some(req),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
Command::TenantConfig { tenant_id, config } => {
|
||||||
|
let tenant_conf = serde_json::from_str(&config)?;
|
||||||
|
|
||||||
|
vps_client
|
||||||
|
.tenant_config(&TenantConfigRequest {
|
||||||
|
tenant_id,
|
||||||
|
config: tenant_conf,
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
Command::TenantScatter { tenant_id } => {
|
||||||
|
// Find the shards
|
||||||
|
let locate_response = storcon_client
|
||||||
|
.dispatch::<(), TenantLocateResponse>(
|
||||||
|
Method::GET,
|
||||||
|
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let shards = locate_response.shards;
|
||||||
|
|
||||||
|
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
|
||||||
|
let shard_count = shards.len();
|
||||||
|
for s in shards {
|
||||||
|
let entry = node_to_shards.entry(s.node_id).or_default();
|
||||||
|
entry.push(s.shard_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load list of available nodes
|
||||||
|
let nodes_resp = storcon_client
|
||||||
|
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||||
|
Method::GET,
|
||||||
|
"control/v1/node".to_string(),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
for node in nodes_resp {
|
||||||
|
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
|
||||||
|
node_to_shards.entry(node.id).or_default();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let max_shard_per_node = shard_count / node_to_shards.len();
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let mut migrate_shard = None;
|
||||||
|
for shards in node_to_shards.values_mut() {
|
||||||
|
if shards.len() > max_shard_per_node {
|
||||||
|
// Pick the emptiest
|
||||||
|
migrate_shard = Some(shards.pop().unwrap());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let Some(migrate_shard) = migrate_shard else {
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Pick the emptiest node to migrate to
|
||||||
|
let mut destinations = node_to_shards
|
||||||
|
.iter()
|
||||||
|
.map(|(k, v)| (k, v.len()))
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
destinations.sort_by_key(|i| i.1);
|
||||||
|
let (destination_node, destination_count) = *destinations.first().unwrap();
|
||||||
|
if destination_count + 1 > max_shard_per_node {
|
||||||
|
// Even the emptiest destination doesn't have space: we're done
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let destination_node = *destination_node;
|
||||||
|
|
||||||
|
node_to_shards
|
||||||
|
.get_mut(&destination_node)
|
||||||
|
.unwrap()
|
||||||
|
.push(migrate_shard);
|
||||||
|
|
||||||
|
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
|
||||||
|
|
||||||
|
storcon_client
|
||||||
|
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||||
|
Method::PUT,
|
||||||
|
format!("control/v1/tenant/{migrate_shard}/migrate"),
|
||||||
|
Some(TenantShardMigrateRequest {
|
||||||
|
tenant_shard_id: migrate_shard,
|
||||||
|
node_id: destination_node,
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Spread the shards across the nodes
|
||||||
|
}
|
||||||
|
Command::TenantDescribe { tenant_id } => {
|
||||||
|
let describe_response = storcon_client
|
||||||
|
.dispatch::<(), TenantDescribeResponse>(
|
||||||
|
Method::GET,
|
||||||
|
format!("control/v1/tenant/{tenant_id}"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let shards = describe_response.shards;
|
||||||
|
let mut table = comfy_table::Table::new();
|
||||||
|
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
|
||||||
|
for shard in shards {
|
||||||
|
let secondary = shard
|
||||||
|
.node_secondary
|
||||||
|
.iter()
|
||||||
|
.map(|n| format!("{}", n))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(",");
|
||||||
|
|
||||||
|
let mut status_parts = Vec::new();
|
||||||
|
if shard.is_reconciling {
|
||||||
|
status_parts.push("reconciling");
|
||||||
|
}
|
||||||
|
|
||||||
|
if shard.is_pending_compute_notification {
|
||||||
|
status_parts.push("pending_compute");
|
||||||
|
}
|
||||||
|
|
||||||
|
if shard.is_splitting {
|
||||||
|
status_parts.push("splitting");
|
||||||
|
}
|
||||||
|
let status = status_parts.join(",");
|
||||||
|
|
||||||
|
table.add_row([
|
||||||
|
format!("{}", shard.tenant_shard_id),
|
||||||
|
shard
|
||||||
|
.node_attached
|
||||||
|
.map(|n| format!("{}", n))
|
||||||
|
.unwrap_or(String::new()),
|
||||||
|
secondary,
|
||||||
|
shard.last_error,
|
||||||
|
status,
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
println!("{table}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@@ -2,8 +2,8 @@
|
|||||||
# see https://diesel.rs/guides/configuring-diesel-cli
|
# see https://diesel.rs/guides/configuring-diesel-cli
|
||||||
|
|
||||||
[print_schema]
|
[print_schema]
|
||||||
file = "control_plane/attachment_service/src/schema.rs"
|
file = "storage_controller/src/schema.rs"
|
||||||
custom_type_derives = ["diesel::query_builder::QueryId"]
|
custom_type_derives = ["diesel::query_builder::QueryId"]
|
||||||
|
|
||||||
[migrations_directory]
|
[migrations_directory]
|
||||||
dir = "control_plane/attachment_service/migrations"
|
dir = "storage_controller/migrations"
|
||||||
|
|||||||
@@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphab
|
|||||||
Neon storage broker, providing messaging between safekeepers and pageservers.
|
Neon storage broker, providing messaging between safekeepers and pageservers.
|
||||||
[storage_broker.md](./storage_broker.md)
|
[storage_broker.md](./storage_broker.md)
|
||||||
|
|
||||||
|
`storage_controller`:
|
||||||
|
|
||||||
|
Neon storage controller, manages a cluster of pageservers and exposes an API that enables
|
||||||
|
managing a many-sharded tenant as a single entity.
|
||||||
|
|
||||||
`/control_plane`:
|
`/control_plane`:
|
||||||
|
|
||||||
Local control plane.
|
Local control plane.
|
||||||
|
|||||||
@@ -2,9 +2,9 @@ use std::str::FromStr;
|
|||||||
|
|
||||||
/// Request/response types for the storage controller
|
/// Request/response types for the storage controller
|
||||||
/// API (`/control/v1` prefix). Implemented by the server
|
/// API (`/control/v1` prefix). Implemented by the server
|
||||||
/// in [`attachment_service::http`]
|
/// in [`storage_controller::http`]
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::NodeId;
|
use utils::id::{NodeId, TenantId};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
models::{ShardParameters, TenantConfig},
|
models::{ShardParameters, TenantConfig},
|
||||||
@@ -42,6 +42,12 @@ pub struct NodeConfigureRequest {
|
|||||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Request body for changing a tenant's policies in the storage controller.
///
/// Both fields are optional.
/// NOTE(review): presumably a `None` field leaves that policy unchanged; confirm against
/// the server-side handler.
#[derive(Serialize, Deserialize)]
pub struct TenantPolicyRequest {
    /// New placement policy for the tenant, if any.
    pub placement: Option<PlacementPolicy>,
    /// New shard scheduling policy for the tenant, if any.
    pub scheduling: Option<ShardSchedulingPolicy>,
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
pub struct TenantLocateResponseShard {
|
pub struct TenantLocateResponseShard {
|
||||||
pub shard_id: TenantShardId,
|
pub shard_id: TenantShardId,
|
||||||
@@ -62,12 +68,27 @@ pub struct TenantLocateResponse {
|
|||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct TenantDescribeResponse {
|
pub struct TenantDescribeResponse {
|
||||||
|
pub tenant_id: TenantId,
|
||||||
pub shards: Vec<TenantDescribeResponseShard>,
|
pub shards: Vec<TenantDescribeResponseShard>,
|
||||||
pub stripe_size: ShardStripeSize,
|
pub stripe_size: ShardStripeSize,
|
||||||
pub policy: PlacementPolicy,
|
pub policy: PlacementPolicy,
|
||||||
pub config: TenantConfig,
|
pub config: TenantConfig,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Description of a single node as reported by the storage controller.
#[derive(Serialize, Deserialize)]
pub struct NodeDescribeResponse {
    /// Identifier of the node.
    pub id: NodeId,

    /// Availability state, in its externally visible form (no utilisation score).
    pub availability: NodeAvailabilityWrapper,
    /// Scheduling policy currently set on the node.
    pub scheduling: NodeSchedulingPolicy,

    /// Address and port of the node's HTTP listener.
    pub listen_http_addr: String,
    pub listen_http_port: u16,

    /// Address and port of the node's postgres-protocol listener.
    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct TenantDescribeResponseShard {
|
pub struct TenantDescribeResponseShard {
|
||||||
pub tenant_shard_id: TenantShardId,
|
pub tenant_shard_id: TenantShardId,
|
||||||
@@ -83,6 +104,8 @@ pub struct TenantDescribeResponseShard {
|
|||||||
pub is_pending_compute_notification: bool,
|
pub is_pending_compute_notification: bool,
|
||||||
/// A shard split is currently underway
|
/// A shard split is currently underway
|
||||||
pub is_splitting: bool,
|
pub is_splitting: bool,
|
||||||
|
|
||||||
|
pub scheduling_policy: ShardSchedulingPolicy,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Explicitly migrating a particular shard is a low level operation
|
/// Explicitly migrating a particular shard is a low level operation
|
||||||
@@ -97,7 +120,7 @@ pub struct TenantShardMigrateRequest {
|
|||||||
/// Utilisation score indicating how good a candidate a pageserver
|
/// Utilisation score indicating how good a candidate a pageserver
|
||||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||||
/// Lower values are better.
|
/// Lower values are better.
|
||||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
|
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
|
||||||
pub struct UtilizationScore(pub u64);
|
pub struct UtilizationScore(pub u64);
|
||||||
|
|
||||||
impl UtilizationScore {
|
impl UtilizationScore {
|
||||||
@@ -106,7 +129,7 @@ impl UtilizationScore {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Clone, Copy)]
|
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
|
||||||
#[serde(into = "NodeAvailabilityWrapper")]
|
#[serde(into = "NodeAvailabilityWrapper")]
|
||||||
pub enum NodeAvailability {
|
pub enum NodeAvailability {
|
||||||
// Normal, happy state
|
// Normal, happy state
|
||||||
@@ -129,7 +152,7 @@ impl Eq for NodeAvailability {}
|
|||||||
// This wrapper provides serde functionality and it should only be used to
|
// This wrapper provides serde functionality and it should only be used to
|
||||||
// communicate with external callers which don't know or care about the
|
// communicate with external callers which don't know or care about the
|
||||||
// utilisation score of the pageserver it is targeting.
|
// utilisation score of the pageserver it is targeting.
|
||||||
#[derive(Serialize, Deserialize, Clone)]
|
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
|
||||||
pub enum NodeAvailabilityWrapper {
|
pub enum NodeAvailabilityWrapper {
|
||||||
Active,
|
Active,
|
||||||
Offline,
|
Offline,
|
||||||
@@ -155,22 +178,33 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FromStr for NodeAvailability {
|
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||||
type Err = anyhow::Error;
|
pub enum ShardSchedulingPolicy {
|
||||||
|
// Normal mode: the tenant's scheduled locations may be updated at will, including
|
||||||
|
// for non-essential optimization.
|
||||||
|
Active,
|
||||||
|
|
||||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
|
||||||
match s {
|
// For example, this still permits a node's attachment location to change to a secondary in
|
||||||
// This is used when parsing node configuration requests from neon-local.
|
// response to a node failure, or to assign a new secondary if a node was removed.
|
||||||
// Assume the worst possible utilisation score
|
Essential,
|
||||||
// and let it get updated via the heartbeats.
|
|
||||||
"active" => Ok(Self::Active(UtilizationScore::worst())),
|
// No scheduling: leave the shard running wherever it currently is. Even if the shard is
|
||||||
"offline" => Ok(Self::Offline),
|
// unavailable, it will not be rescheduled to another node.
|
||||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
Pause,
|
||||||
}
|
|
||||||
|
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
|
||||||
|
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
|
||||||
|
Stop,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for ShardSchedulingPolicy {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::Active
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||||
pub enum NodeSchedulingPolicy {
|
pub enum NodeSchedulingPolicy {
|
||||||
Active,
|
Active,
|
||||||
Filling,
|
Filling,
|
||||||
|
|||||||
@@ -301,6 +301,7 @@ pub struct TenantConfig {
|
|||||||
pub heatmap_period: Option<String>,
|
pub heatmap_period: Option<String>,
|
||||||
pub lazy_slru_download: Option<bool>,
|
pub lazy_slru_download: Option<bool>,
|
||||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||||
|
pub image_layer_creation_check_threshold: Option<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
|
|||||||
@@ -565,6 +565,16 @@ impl GenericRemoteStorage {
|
|||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
pub struct StorageMetadata(HashMap<String, String>);
|
pub struct StorageMetadata(HashMap<String, String>);
|
||||||
|
|
||||||
|
impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
|
||||||
|
fn from(arr: [(&str, &str); N]) -> Self {
|
||||||
|
let map: HashMap<String, String> = arr
|
||||||
|
.iter()
|
||||||
|
.map(|(k, v)| (k.to_string(), v.to_string()))
|
||||||
|
.collect();
|
||||||
|
Self(map)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// External backup storage configuration, enough for creating a client for that storage.
|
/// External backup storage configuration, enough for creating a client for that storage.
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
pub struct RemoteStorageConfig {
|
pub struct RemoteStorageConfig {
|
||||||
|
|||||||
@@ -57,7 +57,6 @@ enum MaybeEnabledStorage {
|
|||||||
Disabled,
|
Disabled,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl AsyncTestContext for MaybeEnabledStorage {
|
impl AsyncTestContext for MaybeEnabledStorage {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
@@ -86,7 +85,6 @@ struct AzureWithTestBlobs {
|
|||||||
remote_blobs: HashSet<RemotePath>,
|
remote_blobs: HashSet<RemotePath>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
@@ -148,7 +146,6 @@ struct AzureWithSimpleTestBlobs {
|
|||||||
remote_blobs: HashSet<RemotePath>,
|
remote_blobs: HashSet<RemotePath>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
|
|||||||
@@ -219,7 +219,6 @@ enum MaybeEnabledStorage {
|
|||||||
Disabled,
|
Disabled,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl AsyncTestContext for MaybeEnabledStorage {
|
impl AsyncTestContext for MaybeEnabledStorage {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
@@ -248,7 +247,6 @@ struct S3WithTestBlobs {
|
|||||||
remote_blobs: HashSet<RemotePath>,
|
remote_blobs: HashSet<RemotePath>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
@@ -310,7 +308,6 @@ struct S3WithSimpleTestBlobs {
|
|||||||
remote_blobs: HashSet<RemotePath>,
|
remote_blobs: HashSet<RemotePath>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait::async_trait]
|
|
||||||
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
|
|||||||
@@ -182,6 +182,18 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
|
||||||
|
pub fn would_wait_for(&self, num: V) -> Result<(), V> {
|
||||||
|
let internal = self.internal.lock().unwrap();
|
||||||
|
let cnt = internal.current.cnt_value();
|
||||||
|
drop(internal);
|
||||||
|
if cnt >= num {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(cnt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Register and return a channel that will be notified when a number arrives,
|
/// Register and return a channel that will be notified when a number arrives,
|
||||||
/// or None, if it has already arrived.
|
/// or None, if it has already arrived.
|
||||||
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
|
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
|
||||||
|
|||||||
@@ -27,25 +27,25 @@
|
|||||||
//!
|
//!
|
||||||
//! # Reference Numbers
|
//! # Reference Numbers
|
||||||
//!
|
//!
|
||||||
//! 2024-03-20 on i3en.3xlarge
|
//! 2024-04-04 on i3en.3xlarge
|
||||||
//!
|
//!
|
||||||
//! ```text
|
//! ```text
|
||||||
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
|
//! short/1 time: [25.925 µs 26.060 µs 26.209 µs]
|
||||||
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
|
//! short/2 time: [31.277 µs 31.483 µs 31.722 µs]
|
||||||
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
|
//! short/4 time: [45.496 µs 45.831 µs 46.182 µs]
|
||||||
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
|
//! short/8 time: [84.298 µs 84.920 µs 85.566 µs]
|
||||||
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
|
//! short/16 time: [185.04 µs 186.41 µs 187.88 µs]
|
||||||
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
|
//! short/32 time: [385.01 µs 386.77 µs 388.70 µs]
|
||||||
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
|
//! short/64 time: [770.24 µs 773.04 µs 776.04 µs]
|
||||||
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
|
//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms]
|
||||||
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
|
//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs]
|
||||||
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
|
//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs]
|
||||||
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
|
//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs]
|
||||||
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
|
//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs]
|
||||||
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
|
//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms]
|
||||||
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
|
//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms]
|
||||||
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
|
//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms]
|
||||||
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
|
//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms]
|
||||||
//! ```
|
//! ```
|
||||||
|
|
||||||
use bytes::{Buf, Bytes};
|
use bytes::{Buf, Bytes};
|
||||||
|
|||||||
@@ -128,12 +128,12 @@ impl Client {
|
|||||||
|
|
||||||
pub async fn timeline_info(
|
pub async fn timeline_info(
|
||||||
&self,
|
&self,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
force_await_logical_size: ForceAwaitLogicalSize,
|
force_await_logical_size: ForceAwaitLogicalSize,
|
||||||
) -> Result<pageserver_api::models::TimelineInfo> {
|
) -> Result<pageserver_api::models::TimelineInfo> {
|
||||||
let uri = format!(
|
let uri = format!(
|
||||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
|
||||||
self.mgmt_api_endpoint
|
self.mgmt_api_endpoint
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -151,11 +151,11 @@ impl Client {
|
|||||||
|
|
||||||
pub async fn keyspace(
|
pub async fn keyspace(
|
||||||
&self,
|
&self,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> Result<pageserver_api::models::partitioning::Partitioning> {
|
) -> Result<pageserver_api::models::partitioning::Partitioning> {
|
||||||
let uri = format!(
|
let uri = format!(
|
||||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
|
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
|
||||||
self.mgmt_api_endpoint
|
self.mgmt_api_endpoint
|
||||||
);
|
);
|
||||||
self.get(&uri)
|
self.get(&uri)
|
||||||
|
|||||||
@@ -43,7 +43,8 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
|||||||
fanout: u64,
|
fanout: u64,
|
||||||
ctx: &E::RequestContext,
|
ctx: &E::RequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
assert!(fanout >= 2);
|
assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
|
||||||
|
let exp_base = fanout.max(2);
|
||||||
// Start at L0
|
// Start at L0
|
||||||
let mut current_level_no = 0;
|
let mut current_level_no = 0;
|
||||||
let mut current_level_target_height = target_file_size;
|
let mut current_level_target_height = target_file_size;
|
||||||
@@ -106,7 +107,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
current_level_no += 1;
|
current_level_no += 1;
|
||||||
current_level_target_height = current_level_target_height.saturating_mul(fanout);
|
current_level_target_height = current_level_target_height.saturating_mul(exp_base);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
|
use pageserver_api::shard::TenantShardId;
|
||||||
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
|
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
|
||||||
use pageserver_client::page_service::BasebackupRequest;
|
use pageserver_client::page_service::BasebackupRequest;
|
||||||
|
|
||||||
@@ -95,7 +96,7 @@ async fn main_impl(
|
|||||||
let timeline = *timeline;
|
let timeline = *timeline;
|
||||||
let info = mgmt_api_client
|
let info = mgmt_api_client
|
||||||
.timeline_info(
|
.timeline_info(
|
||||||
timeline.tenant_id,
|
TenantShardId::unsharded(timeline.tenant_id),
|
||||||
timeline.timeline_id,
|
timeline.timeline_id,
|
||||||
ForceAwaitLogicalSize::No,
|
ForceAwaitLogicalSize::No,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
|
|||||||
use pageserver_api::keyspace::KeySpaceAccum;
|
use pageserver_api::keyspace::KeySpaceAccum;
|
||||||
use pageserver_api::models::PagestreamGetPageRequest;
|
use pageserver_api::models::PagestreamGetPageRequest;
|
||||||
|
|
||||||
|
use pageserver_api::shard::TenantShardId;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use utils::id::TenantTimelineId;
|
use utils::id::TenantTimelineId;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
@@ -173,7 +174,10 @@ async fn main_impl(
|
|||||||
let timeline = *timeline;
|
let timeline = *timeline;
|
||||||
async move {
|
async move {
|
||||||
let partitioning = mgmt_api_client
|
let partitioning = mgmt_api_client
|
||||||
.keyspace(timeline.tenant_id, timeline.timeline_id)
|
.keyspace(
|
||||||
|
TenantShardId::unsharded(timeline.tenant_id),
|
||||||
|
timeline.timeline_id,
|
||||||
|
)
|
||||||
.await?;
|
.await?;
|
||||||
let lsn = partitioning.at_lsn;
|
let lsn = partitioning.at_lsn;
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use humantime::Duration;
|
use humantime::Duration;
|
||||||
|
use pageserver_api::shard::TenantShardId;
|
||||||
use tokio::task::JoinSet;
|
use tokio::task::JoinSet;
|
||||||
use utils::id::TenantTimelineId;
|
use utils::id::TenantTimelineId;
|
||||||
|
|
||||||
@@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
|||||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||||
js.spawn(async move {
|
js.spawn(async move {
|
||||||
let info = mgmt_api_client
|
let info = mgmt_api_client
|
||||||
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
|
.timeline_info(
|
||||||
|
TenantShardId::unsharded(tl.tenant_id),
|
||||||
|
tl.timeline_id,
|
||||||
|
ForceAwaitLogicalSize::Yes,
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
@@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
|||||||
while !info.current_logical_size_is_accurate {
|
while !info.current_logical_size_is_accurate {
|
||||||
ticker.tick().await;
|
ticker.tick().await;
|
||||||
info = mgmt_api_client
|
info = mgmt_api_client
|
||||||
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
|
.timeline_info(
|
||||||
|
TenantShardId::unsharded(tl.tenant_id),
|
||||||
|
tl.timeline_id,
|
||||||
|
ForceAwaitLogicalSize::Yes,
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
|||||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||||
use pageserver::tenant::{secondary, TenantSharedResources};
|
use pageserver::tenant::{secondary, TenantSharedResources};
|
||||||
use remote_storage::GenericRemoteStorage;
|
use remote_storage::GenericRemoteStorage;
|
||||||
|
use tokio::signal::unix::SignalKind;
|
||||||
use tokio::time::Instant;
|
use tokio::time::Instant;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
@@ -671,42 +672,37 @@ fn start_pageserver(
|
|||||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||||
|
|
||||||
// All started up! Now just sit and wait for shutdown signal.
|
// All started up! Now just sit and wait for shutdown signal.
|
||||||
{
|
|
||||||
use signal_hook::consts::*;
|
|
||||||
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
|
|
||||||
let mut signals =
|
|
||||||
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
|
|
||||||
return signals
|
|
||||||
.forever()
|
|
||||||
.next()
|
|
||||||
.expect("forever() never returns None unless explicitly closed");
|
|
||||||
});
|
|
||||||
let signal = BACKGROUND_RUNTIME
|
|
||||||
.block_on(signal_handler)
|
|
||||||
.expect("join error");
|
|
||||||
match signal {
|
|
||||||
SIGQUIT => {
|
|
||||||
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
|
|
||||||
std::process::exit(111);
|
|
||||||
}
|
|
||||||
SIGINT | SIGTERM => {
|
|
||||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
|
|
||||||
|
|
||||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
{
|
||||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
BACKGROUND_RUNTIME.block_on(async move {
|
||||||
// The plan is to change that over time.
|
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
|
||||||
shutdown_pageserver.take();
|
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
|
||||||
let bg_remote_storage = remote_storage.clone();
|
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
|
||||||
let bg_deletion_queue = deletion_queue.clone();
|
let signal = tokio::select! {
|
||||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
_ = sigquit.recv() => {
|
||||||
&tenant_manager,
|
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
|
||||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
std::process::exit(111);
|
||||||
0,
|
}
|
||||||
));
|
_ = sigint.recv() => { "SIGINT" },
|
||||||
unreachable!()
|
_ = sigterm.recv() => { "SIGTERM" },
|
||||||
}
|
};
|
||||||
_ => unreachable!(),
|
|
||||||
}
|
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
|
||||||
|
|
||||||
|
// This cancels the `shutdown_pageserver` cancellation tree.
|
||||||
|
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||||
|
// The plan is to change that over time.
|
||||||
|
shutdown_pageserver.take();
|
||||||
|
let bg_remote_storage = remote_storage.clone();
|
||||||
|
let bg_deletion_queue = deletion_queue.clone();
|
||||||
|
pageserver::shutdown_pageserver(
|
||||||
|
&tenant_manager,
|
||||||
|
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||||
|
0,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
unreachable!()
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ use pageserver_api::{
|
|||||||
use serde::{de::DeserializeOwned, Serialize};
|
use serde::{de::DeserializeOwned, Serialize};
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
use utils::{backoff, generation::Generation, id::NodeId};
|
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
config::{NodeMetadata, PageServerConf},
|
config::{NodeMetadata, PageServerConf},
|
||||||
@@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
|||||||
.collect(),
|
.collect(),
|
||||||
};
|
};
|
||||||
|
|
||||||
fail::fail_point!("control-plane-client-validate");
|
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
|
||||||
|
if self.cancel.is_cancelled() {
|
||||||
|
return Err(RetryForeverError::ShuttingDown);
|
||||||
|
}
|
||||||
|
|
||||||
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||||
|
|
||||||
|
|||||||
@@ -1629,7 +1629,7 @@ components:
|
|||||||
type: integer
|
type: integer
|
||||||
format: int64
|
format: int64
|
||||||
minimum: 0
|
minimum: 0
|
||||||
description: The amount of disk space currently utilized by layer files.
|
description: The amount of disk space currently used.
|
||||||
free_space_bytes:
|
free_space_bytes:
|
||||||
type: integer
|
type: integer
|
||||||
format: int64
|
format: int64
|
||||||
|
|||||||
@@ -993,11 +993,26 @@ async fn tenant_status(
|
|||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
|
|
||||||
|
// In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
|
||||||
|
let activate = true;
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
|
let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
|
||||||
|
|
||||||
let tenant_info = async {
|
let tenant_info = async {
|
||||||
let tenant = state
|
let tenant = state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||||
|
|
||||||
|
if activate {
|
||||||
|
// This is advisory: we prefer to let the tenant activate on-demand when this function is
|
||||||
|
// called, but it is still valid to return 200 and describe the current state of the tenant
|
||||||
|
// if it doesn't make it into an active state.
|
||||||
|
tenant
|
||||||
|
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
|
||||||
|
.await
|
||||||
|
.ok();
|
||||||
|
}
|
||||||
|
|
||||||
// Calculate total physical size of all timelines
|
// Calculate total physical size of all timelines
|
||||||
let mut current_physical_size = 0;
|
let mut current_physical_size = 0;
|
||||||
for timeline in tenant.list_timelines().iter() {
|
for timeline in tenant.list_timelines().iter() {
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result};
|
|||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
|
use pageserver_api::key::rel_block_to_key;
|
||||||
use tokio::io::{AsyncRead, AsyncReadExt};
|
use tokio::io::{AsyncRead, AsyncReadExt};
|
||||||
use tokio_tar::Archive;
|
use tokio_tar::Archive;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
@@ -170,7 +171,10 @@ async fn import_rel(
|
|||||||
let r = reader.read_exact(&mut buf).await;
|
let r = reader.read_exact(&mut buf).await;
|
||||||
match r {
|
match r {
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
|
let key = rel_block_to_key(rel, blknum);
|
||||||
|
if modification.tline.get_shard_identity().is_key_local(&key) {
|
||||||
|
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: UnexpectedEof is expected
|
// TODO: UnexpectedEof is expected
|
||||||
|
|||||||
@@ -1483,12 +1483,18 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
|
|||||||
});
|
});
|
||||||
|
|
||||||
pub(crate) struct WalIngestMetrics {
|
pub(crate) struct WalIngestMetrics {
|
||||||
|
pub(crate) bytes_received: IntCounter,
|
||||||
pub(crate) records_received: IntCounter,
|
pub(crate) records_received: IntCounter,
|
||||||
pub(crate) records_committed: IntCounter,
|
pub(crate) records_committed: IntCounter,
|
||||||
pub(crate) records_filtered: IntCounter,
|
pub(crate) records_filtered: IntCounter,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||||
|
bytes_received: register_int_counter!(
|
||||||
|
"pageserver_wal_ingest_bytes_received",
|
||||||
|
"Bytes of WAL ingested from safekeepers",
|
||||||
|
)
|
||||||
|
.unwrap(),
|
||||||
records_received: register_int_counter!(
|
records_received: register_int_counter!(
|
||||||
"pageserver_wal_ingest_records_received",
|
"pageserver_wal_ingest_records_received",
|
||||||
"Number of WAL records received from safekeepers"
|
"Number of WAL records received from safekeepers"
|
||||||
|
|||||||
@@ -876,7 +876,13 @@ impl PageServerHandler {
|
|||||||
if lsn <= last_record_lsn {
|
if lsn <= last_record_lsn {
|
||||||
lsn = last_record_lsn;
|
lsn = last_record_lsn;
|
||||||
} else {
|
} else {
|
||||||
timeline.wait_lsn(lsn, ctx).await?;
|
timeline
|
||||||
|
.wait_lsn(
|
||||||
|
lsn,
|
||||||
|
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
// Since we waited for 'lsn' to arrive, that is now the last
|
// Since we waited for 'lsn' to arrive, that is now the last
|
||||||
// record LSN. (Or close enough for our purposes; the
|
// record LSN. (Or close enough for our purposes; the
|
||||||
// last-record LSN can advance immediately after we return
|
// last-record LSN can advance immediately after we return
|
||||||
@@ -888,7 +894,13 @@ impl PageServerHandler {
|
|||||||
"invalid LSN(0) in request".into(),
|
"invalid LSN(0) in request".into(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
timeline.wait_lsn(lsn, ctx).await?;
|
timeline
|
||||||
|
.wait_lsn(
|
||||||
|
lsn,
|
||||||
|
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if lsn < **latest_gc_cutoff_lsn {
|
if lsn < **latest_gc_cutoff_lsn {
|
||||||
@@ -1215,7 +1227,13 @@ impl PageServerHandler {
|
|||||||
if let Some(lsn) = lsn {
|
if let Some(lsn) = lsn {
|
||||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||||
info!("waiting for {}", lsn);
|
info!("waiting for {}", lsn);
|
||||||
timeline.wait_lsn(lsn, ctx).await?;
|
timeline
|
||||||
|
.wait_lsn(
|
||||||
|
lsn,
|
||||||
|
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
timeline
|
timeline
|
||||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||||
.context("invalid basebackup lsn")?;
|
.context("invalid basebackup lsn")?;
|
||||||
|
|||||||
@@ -214,13 +214,12 @@ pub enum TaskKind {
|
|||||||
/// Internally, `Client` hands over requests to the `Connection` object.
|
/// Internally, `Client` hands over requests to the `Connection` object.
|
||||||
/// The `Connection` object is responsible for speaking the wire protocol.
|
/// The `Connection` object is responsible for speaking the wire protocol.
|
||||||
///
|
///
|
||||||
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
/// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
||||||
/// That abstraction doesn't use `task_mgr`.
|
|
||||||
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
|
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
|
||||||
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
|
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
|
||||||
///
|
///
|
||||||
/// Once the connection is established, the `TaskHandle` task creates a
|
/// Once the connection is established, the `TaskHandle` task spawns a
|
||||||
/// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
|
/// [`WalReceiverConnectionPoller`] task that is responsible for polling
|
||||||
/// the `Connection` object.
|
/// the `Connection` object.
|
||||||
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
||||||
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
|
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
|
||||||
@@ -230,7 +229,6 @@ pub enum TaskKind {
|
|||||||
WalReceiverManager,
|
WalReceiverManager,
|
||||||
|
|
||||||
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
|
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
|
||||||
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
|
|
||||||
/// See the comment on [`WalReceiverManager`].
|
/// See the comment on [`WalReceiverManager`].
|
||||||
///
|
///
|
||||||
/// [`WalReceiverManager`]: Self::WalReceiverManager
|
/// [`WalReceiverManager`]: Self::WalReceiverManager
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
//!
|
//!
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
|
use arc_swap::ArcSwap;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use enumset::EnumSet;
|
use enumset::EnumSet;
|
||||||
@@ -98,7 +99,7 @@ use std::ops::Bound::Included;
|
|||||||
use std::sync::atomic::AtomicU64;
|
use std::sync::atomic::AtomicU64;
|
||||||
use std::sync::atomic::Ordering;
|
use std::sync::atomic::Ordering;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::{Mutex, RwLock};
|
use std::sync::Mutex;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use crate::span;
|
use crate::span;
|
||||||
@@ -260,7 +261,7 @@ pub struct Tenant {
|
|||||||
// We keep TenantConfOpt sturct here to preserve the information
|
// We keep TenantConfOpt sturct here to preserve the information
|
||||||
// about parameters that are not set.
|
// about parameters that are not set.
|
||||||
// This is necessary to allow global config updates.
|
// This is necessary to allow global config updates.
|
||||||
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
|
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||||
|
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
|
|
||||||
@@ -1515,7 +1516,7 @@ impl Tenant {
|
|||||||
// sizes etc. and that would get confused if the previous page versions
|
// sizes etc. and that would get confused if the previous page versions
|
||||||
// are not in the repository yet.
|
// are not in the repository yet.
|
||||||
ancestor_timeline
|
ancestor_timeline
|
||||||
.wait_lsn(*lsn, ctx)
|
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| match e {
|
.map_err(|e| match e {
|
||||||
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
|
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
|
||||||
@@ -1606,7 +1607,7 @@ impl Tenant {
|
|||||||
);
|
);
|
||||||
|
|
||||||
{
|
{
|
||||||
let conf = self.tenant_conf.read().unwrap();
|
let conf = self.tenant_conf.load();
|
||||||
|
|
||||||
if !conf.location.may_delete_layers_hint() {
|
if !conf.location.may_delete_layers_hint() {
|
||||||
info!("Skipping GC in location state {:?}", conf.location);
|
info!("Skipping GC in location state {:?}", conf.location);
|
||||||
@@ -1633,7 +1634,7 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
let conf = self.tenant_conf.read().unwrap();
|
let conf = self.tenant_conf.load();
|
||||||
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
|
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
|
||||||
info!("Skipping compaction in location state {:?}", conf.location);
|
info!("Skipping compaction in location state {:?}", conf.location);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
@@ -1782,7 +1783,7 @@ impl Tenant {
|
|||||||
async fn shutdown(
|
async fn shutdown(
|
||||||
&self,
|
&self,
|
||||||
shutdown_progress: completion::Barrier,
|
shutdown_progress: completion::Barrier,
|
||||||
freeze_and_flush: bool,
|
shutdown_mode: timeline::ShutdownMode,
|
||||||
) -> Result<(), completion::Barrier> {
|
) -> Result<(), completion::Barrier> {
|
||||||
span::debug_assert_current_span_has_tenant_id();
|
span::debug_assert_current_span_has_tenant_id();
|
||||||
|
|
||||||
@@ -1829,16 +1830,8 @@ impl Tenant {
|
|||||||
timelines.values().for_each(|timeline| {
|
timelines.values().for_each(|timeline| {
|
||||||
let timeline = Arc::clone(timeline);
|
let timeline = Arc::clone(timeline);
|
||||||
let timeline_id = timeline.timeline_id;
|
let timeline_id = timeline.timeline_id;
|
||||||
|
let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
|
||||||
let span =
|
js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
|
||||||
tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
|
|
||||||
js.spawn(async move {
|
|
||||||
if freeze_and_flush {
|
|
||||||
timeline.flush_and_shutdown().instrument(span).await
|
|
||||||
} else {
|
|
||||||
timeline.shutdown().instrument(span).await
|
|
||||||
}
|
|
||||||
});
|
|
||||||
})
|
})
|
||||||
};
|
};
|
||||||
// test_long_timeline_create_then_tenant_delete is leaning on this message
|
// test_long_timeline_create_then_tenant_delete is leaning on this message
|
||||||
@@ -2082,14 +2075,14 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
|
pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
|
||||||
self.tenant_conf.read().unwrap().location.attach_mode
|
self.tenant_conf.load().location.attach_mode
|
||||||
}
|
}
|
||||||
|
|
||||||
/// For API access: generate a LocationConfig equivalent to the one that would be used to
|
/// For API access: generate a LocationConfig equivalent to the one that would be used to
|
||||||
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
|
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
|
||||||
/// rare external API calls, like a reconciliation at startup.
|
/// rare external API calls, like a reconciliation at startup.
|
||||||
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
|
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
|
||||||
let conf = self.tenant_conf.read().unwrap();
|
let conf = self.tenant_conf.load();
|
||||||
|
|
||||||
let location_config_mode = match conf.location.attach_mode {
|
let location_config_mode = match conf.location.attach_mode {
|
||||||
AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
|
AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
|
||||||
@@ -2236,7 +2229,7 @@ where
|
|||||||
|
|
||||||
impl Tenant {
|
impl Tenant {
|
||||||
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
|
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
|
||||||
self.tenant_conf.read().unwrap().tenant_conf.clone()
|
self.tenant_conf.load().tenant_conf.clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn effective_config(&self) -> TenantConf {
|
pub fn effective_config(&self) -> TenantConf {
|
||||||
@@ -2245,84 +2238,84 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_checkpoint_distance(&self) -> u64 {
|
pub fn get_checkpoint_distance(&self) -> u64 {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.checkpoint_distance
|
.checkpoint_distance
|
||||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
|
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_checkpoint_timeout(&self) -> Duration {
|
pub fn get_checkpoint_timeout(&self) -> Duration {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.checkpoint_timeout
|
.checkpoint_timeout
|
||||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
|
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_compaction_target_size(&self) -> u64 {
|
pub fn get_compaction_target_size(&self) -> u64 {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.compaction_target_size
|
.compaction_target_size
|
||||||
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
|
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_compaction_period(&self) -> Duration {
|
pub fn get_compaction_period(&self) -> Duration {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.compaction_period
|
.compaction_period
|
||||||
.unwrap_or(self.conf.default_tenant_conf.compaction_period)
|
.unwrap_or(self.conf.default_tenant_conf.compaction_period)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_compaction_threshold(&self) -> usize {
|
pub fn get_compaction_threshold(&self) -> usize {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.compaction_threshold
|
.compaction_threshold
|
||||||
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_gc_horizon(&self) -> u64 {
|
pub fn get_gc_horizon(&self) -> u64 {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.gc_horizon
|
.gc_horizon
|
||||||
.unwrap_or(self.conf.default_tenant_conf.gc_horizon)
|
.unwrap_or(self.conf.default_tenant_conf.gc_horizon)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_gc_period(&self) -> Duration {
|
pub fn get_gc_period(&self) -> Duration {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.gc_period
|
.gc_period
|
||||||
.unwrap_or(self.conf.default_tenant_conf.gc_period)
|
.unwrap_or(self.conf.default_tenant_conf.gc_period)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_image_creation_threshold(&self) -> usize {
|
pub fn get_image_creation_threshold(&self) -> usize {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.image_creation_threshold
|
.image_creation_threshold
|
||||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_pitr_interval(&self) -> Duration {
|
pub fn get_pitr_interval(&self) -> Duration {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.pitr_interval
|
.pitr_interval
|
||||||
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
|
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_trace_read_requests(&self) -> bool {
|
pub fn get_trace_read_requests(&self) -> bool {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.trace_read_requests
|
.trace_read_requests
|
||||||
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
|
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_min_resident_size_override(&self) -> Option<u64> {
|
pub fn get_min_resident_size_override(&self) -> Option<u64> {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
.min_resident_size_override
|
.min_resident_size_override
|
||||||
.or(self.conf.default_tenant_conf.min_resident_size_override)
|
.or(self.conf.default_tenant_conf.min_resident_size_override)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_heatmap_period(&self) -> Option<Duration> {
|
pub fn get_heatmap_period(&self) -> Option<Duration> {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||||
let heatmap_period = tenant_conf
|
let heatmap_period = tenant_conf
|
||||||
.heatmap_period
|
.heatmap_period
|
||||||
.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
|
.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
|
||||||
@@ -2334,26 +2327,40 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
|
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
|
||||||
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
|
// Use read-copy-update in order to avoid overwriting the location config
|
||||||
self.tenant_conf_updated();
|
// state if this races with [`Tenant::set_new_location_config`]. Note that
|
||||||
|
// this race is not possible if both request types come from the storage
|
||||||
|
// controller (as they should!) because an exclusive op lock is required
|
||||||
|
// on the storage controller side.
|
||||||
|
self.tenant_conf.rcu(|inner| {
|
||||||
|
Arc::new(AttachedTenantConf {
|
||||||
|
tenant_conf: new_tenant_conf.clone(),
|
||||||
|
location: inner.location,
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
self.tenant_conf_updated(&new_tenant_conf);
|
||||||
// Don't hold self.timelines.lock() during the notifies.
|
// Don't hold self.timelines.lock() during the notifies.
|
||||||
// There's no risk of deadlock right now, but there could be if we consolidate
|
// There's no risk of deadlock right now, but there could be if we consolidate
|
||||||
// mutexes in struct Timeline in the future.
|
// mutexes in struct Timeline in the future.
|
||||||
let timelines = self.list_timelines();
|
let timelines = self.list_timelines();
|
||||||
for timeline in timelines {
|
for timeline in timelines {
|
||||||
timeline.tenant_conf_updated();
|
timeline.tenant_conf_updated(&new_tenant_conf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
|
pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
|
||||||
*self.tenant_conf.write().unwrap() = new_conf;
|
let new_tenant_conf = new_conf.tenant_conf.clone();
|
||||||
self.tenant_conf_updated();
|
|
||||||
|
self.tenant_conf.store(Arc::new(new_conf));
|
||||||
|
|
||||||
|
self.tenant_conf_updated(&new_tenant_conf);
|
||||||
// Don't hold self.timelines.lock() during the notifies.
|
// Don't hold self.timelines.lock() during the notifies.
|
||||||
// There's no risk of deadlock right now, but there could be if we consolidate
|
// There's no risk of deadlock right now, but there could be if we consolidate
|
||||||
// mutexes in struct Timeline in the future.
|
// mutexes in struct Timeline in the future.
|
||||||
let timelines = self.list_timelines();
|
let timelines = self.list_timelines();
|
||||||
for timeline in timelines {
|
for timeline in timelines {
|
||||||
timeline.tenant_conf_updated();
|
timeline.tenant_conf_updated(&new_tenant_conf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2367,11 +2374,8 @@ impl Tenant {
|
|||||||
.unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
|
.unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn tenant_conf_updated(&self) {
|
pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
|
||||||
let conf = {
|
let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
|
||||||
let guard = self.tenant_conf.read().unwrap();
|
|
||||||
Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
|
|
||||||
};
|
|
||||||
self.timeline_get_throttle.reconfigure(conf)
|
self.timeline_get_throttle.reconfigure(conf)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2519,7 +2523,7 @@ impl Tenant {
|
|||||||
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
|
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
|
||||||
&crate::metrics::tenant_throttling::TIMELINE_GET,
|
&crate::metrics::tenant_throttling::TIMELINE_GET,
|
||||||
)),
|
)),
|
||||||
tenant_conf: Arc::new(RwLock::new(attached_conf)),
|
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3505,7 +3509,7 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
|
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
|
||||||
self.tenant_conf.read().unwrap().tenant_conf.clone()
|
self.tenant_conf.load().tenant_conf.clone()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3653,6 +3657,9 @@ pub(crate) mod harness {
|
|||||||
heatmap_period: Some(tenant_conf.heatmap_period),
|
heatmap_period: Some(tenant_conf.heatmap_period),
|
||||||
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
|
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
|
||||||
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
|
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
|
||||||
|
image_layer_creation_check_threshold: Some(
|
||||||
|
tenant_conf.image_layer_creation_check_threshold,
|
||||||
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -3851,6 +3858,7 @@ mod tests {
|
|||||||
use hex_literal::hex;
|
use hex_literal::hex;
|
||||||
use pageserver_api::keyspace::KeySpace;
|
use pageserver_api::keyspace::KeySpace;
|
||||||
use rand::{thread_rng, Rng};
|
use rand::{thread_rng, Rng};
|
||||||
|
use tests::timeline::ShutdownMode;
|
||||||
|
|
||||||
static TEST_KEY: Lazy<Key> =
|
static TEST_KEY: Lazy<Key> =
|
||||||
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||||
@@ -4296,7 +4304,7 @@ mod tests {
|
|||||||
make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
|
make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
|
||||||
// so that all uploads finish & we can call harness.load() below again
|
// so that all uploads finish & we can call harness.load() below again
|
||||||
tenant
|
tenant
|
||||||
.shutdown(Default::default(), true)
|
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
|
||||||
.instrument(harness.span())
|
.instrument(harness.span())
|
||||||
.await
|
.await
|
||||||
.ok()
|
.ok()
|
||||||
@@ -4337,7 +4345,7 @@ mod tests {
|
|||||||
|
|
||||||
// so that all uploads finish & we can call harness.load() below again
|
// so that all uploads finish & we can call harness.load() below again
|
||||||
tenant
|
tenant
|
||||||
.shutdown(Default::default(), true)
|
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
|
||||||
.instrument(harness.span())
|
.instrument(harness.span())
|
||||||
.await
|
.await
|
||||||
.ok()
|
.ok()
|
||||||
@@ -5118,7 +5126,7 @@ mod tests {
|
|||||||
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
|
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
|
||||||
let raw_tline = tline.raw_timeline().unwrap();
|
let raw_tline = tline.raw_timeline().unwrap();
|
||||||
raw_tline
|
raw_tline
|
||||||
.shutdown()
|
.shutdown(super::timeline::ShutdownMode::Hard)
|
||||||
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
|
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
|
||||||
.await;
|
.await;
|
||||||
std::mem::forget(tline);
|
std::mem::forget(tline);
|
||||||
|
|||||||
@@ -57,6 +57,9 @@ pub mod defaults {
|
|||||||
// throughputs up to 1GiB/s per timeline.
|
// throughputs up to 1GiB/s per timeline.
|
||||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
|
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
|
||||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||||
|
// By default ingest enough WAL for two new L0 layers before checking if new image
|
||||||
|
// layers should be created.
|
||||||
|
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||||
|
|
||||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||||
}
|
}
|
||||||
@@ -362,6 +365,10 @@ pub struct TenantConf {
|
|||||||
pub lazy_slru_download: bool,
|
pub lazy_slru_download: bool,
|
||||||
|
|
||||||
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
|
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
|
||||||
|
|
||||||
|
// How much WAL must be ingested before checking again whether a new image layer is required.
|
||||||
|
// Expressed in multiples of checkpoint distance.
|
||||||
|
pub image_layer_creation_check_threshold: u8,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Same as TenantConf, but this struct preserves the information about
|
/// Same as TenantConf, but this struct preserves the information about
|
||||||
@@ -454,6 +461,9 @@ pub struct TenantConfOpt {
|
|||||||
|
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
|
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
|
||||||
|
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
pub image_layer_creation_check_threshold: Option<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TenantConfOpt {
|
impl TenantConfOpt {
|
||||||
@@ -508,6 +518,9 @@ impl TenantConfOpt {
|
|||||||
.timeline_get_throttle
|
.timeline_get_throttle
|
||||||
.clone()
|
.clone()
|
||||||
.unwrap_or(global_conf.timeline_get_throttle),
|
.unwrap_or(global_conf.timeline_get_throttle),
|
||||||
|
image_layer_creation_check_threshold: self
|
||||||
|
.image_layer_creation_check_threshold
|
||||||
|
.unwrap_or(global_conf.image_layer_creation_check_threshold),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -548,6 +561,7 @@ impl Default for TenantConf {
|
|||||||
heatmap_period: Duration::ZERO,
|
heatmap_period: Duration::ZERO,
|
||||||
lazy_slru_download: false,
|
lazy_slru_download: false,
|
||||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||||
|
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -621,6 +635,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
|||||||
heatmap_period: value.heatmap_period.map(humantime),
|
heatmap_period: value.heatmap_period.map(humantime),
|
||||||
lazy_slru_download: value.lazy_slru_download,
|
lazy_slru_download: value.lazy_slru_download,
|
||||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||||
|
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,7 +14,10 @@ use crate::{
|
|||||||
config::PageServerConf,
|
config::PageServerConf,
|
||||||
context::RequestContext,
|
context::RequestContext,
|
||||||
task_mgr::{self, TaskKind},
|
task_mgr::{self, TaskKind},
|
||||||
tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
|
tenant::{
|
||||||
|
mgr::{TenantSlot, TenantsMapRemoveResult},
|
||||||
|
timeline::ShutdownMode,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
@@ -463,7 +466,7 @@ impl DeleteTenantFlow {
|
|||||||
// tenant.shutdown
|
// tenant.shutdown
|
||||||
// It's also bad that we're holding tenants.read here.
|
// It's also bad that we're holding tenants.read here.
|
||||||
// TODO relax set_stopping to be idempotent?
|
// TODO relax set_stopping to be idempotent?
|
||||||
if tenant.shutdown(progress, false).await.is_err() {
|
if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
|
||||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||||
"tenant shutdown is already in progress"
|
"tenant shutdown is already in progress"
|
||||||
)));
|
)));
|
||||||
|
|||||||
@@ -72,6 +72,10 @@ impl EphemeralFile {
|
|||||||
self.len
|
self.len
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn id(&self) -> page_cache::FileId {
|
||||||
|
self.page_cache_file_id
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) async fn read_blk(
|
pub(crate) async fn read_blk(
|
||||||
&self,
|
&self,
|
||||||
blknum: u32,
|
blknum: u32,
|
||||||
|
|||||||
@@ -346,35 +346,6 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
|
||||||
pub enum InMemoryLayerHandle {
|
|
||||||
Open {
|
|
||||||
lsn_floor: Lsn,
|
|
||||||
end_lsn: Lsn,
|
|
||||||
},
|
|
||||||
Frozen {
|
|
||||||
idx: usize,
|
|
||||||
lsn_floor: Lsn,
|
|
||||||
end_lsn: Lsn,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
impl InMemoryLayerHandle {
|
|
||||||
pub fn get_lsn_floor(&self) -> Lsn {
|
|
||||||
match self {
|
|
||||||
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
|
|
||||||
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_end_lsn(&self) -> Lsn {
|
|
||||||
match self {
|
|
||||||
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
|
|
||||||
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl LayerMap {
|
impl LayerMap {
|
||||||
///
|
///
|
||||||
/// Find the latest layer (by lsn.end) that covers the given
|
/// Find the latest layer (by lsn.end) that covers the given
|
||||||
@@ -576,41 +547,18 @@ impl LayerMap {
|
|||||||
self.historic.iter()
|
self.historic.iter()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get a handle for the first in memory layer that matches the provided predicate.
|
/// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
|
||||||
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
|
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
|
||||||
///
|
|
||||||
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
|
|
||||||
/// the same exclusive region established by holding the layer manager lock.
|
|
||||||
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
|
|
||||||
where
|
where
|
||||||
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
|
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
|
||||||
{
|
{
|
||||||
if let Some(open) = &self.open_layer {
|
if let Some(open) = &self.open_layer {
|
||||||
if pred(open) {
|
if pred(open) {
|
||||||
return Some(InMemoryLayerHandle::Open {
|
return Some(open.clone());
|
||||||
lsn_floor: open.get_lsn_range().start,
|
|
||||||
end_lsn: open.get_lsn_range().end,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let pos = self.frozen_layers.iter().rev().position(pred);
|
self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
|
||||||
pos.map(|rev_idx| {
|
|
||||||
let idx = self.frozen_layers.len() - 1 - rev_idx;
|
|
||||||
InMemoryLayerHandle::Frozen {
|
|
||||||
idx,
|
|
||||||
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
|
|
||||||
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the layer pointed to by the provided handle.
|
|
||||||
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
|
|
||||||
match handle {
|
|
||||||
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
|
|
||||||
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ use crate::tenant::config::{
|
|||||||
use crate::tenant::delete::DeleteTenantFlow;
|
use crate::tenant::delete::DeleteTenantFlow;
|
||||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||||
use crate::tenant::storage_layer::inmemory_layer;
|
use crate::tenant::storage_layer::inmemory_layer;
|
||||||
|
use crate::tenant::timeline::ShutdownMode;
|
||||||
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
|
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||||
|
|
||||||
@@ -783,11 +784,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
|||||||
shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
|
shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
|
||||||
join_set.spawn(
|
join_set.spawn(
|
||||||
async move {
|
async move {
|
||||||
let freeze_and_flush = true;
|
|
||||||
|
|
||||||
let res = {
|
let res = {
|
||||||
let (_guard, shutdown_progress) = completion::channel();
|
let (_guard, shutdown_progress) = completion::channel();
|
||||||
t.shutdown(shutdown_progress, freeze_and_flush).await
|
t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Err(other_progress) = res {
|
if let Err(other_progress) = res {
|
||||||
@@ -1107,7 +1106,7 @@ impl TenantManager {
|
|||||||
};
|
};
|
||||||
|
|
||||||
info!("Shutting down attached tenant");
|
info!("Shutting down attached tenant");
|
||||||
match tenant.shutdown(progress, false).await {
|
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||||
Ok(()) => {}
|
Ok(()) => {}
|
||||||
Err(barrier) => {
|
Err(barrier) => {
|
||||||
info!("Shutdown already in progress, waiting for it to complete");
|
info!("Shutdown already in progress, waiting for it to complete");
|
||||||
@@ -1223,7 +1222,7 @@ impl TenantManager {
|
|||||||
TenantSlot::Attached(tenant) => {
|
TenantSlot::Attached(tenant) => {
|
||||||
let (_guard, progress) = utils::completion::channel();
|
let (_guard, progress) = utils::completion::channel();
|
||||||
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
|
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
|
||||||
match tenant.shutdown(progress, false).await {
|
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
info!("Finished shutting down just-spawned tenant");
|
info!("Finished shutting down just-spawned tenant");
|
||||||
}
|
}
|
||||||
@@ -1273,7 +1272,7 @@ impl TenantManager {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let (_guard, progress) = utils::completion::channel();
|
let (_guard, progress) = utils::completion::channel();
|
||||||
match tenant.shutdown(progress, false).await {
|
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
slot_guard.drop_old_value()?;
|
slot_guard.drop_old_value()?;
|
||||||
}
|
}
|
||||||
@@ -1649,7 +1648,14 @@ impl TenantManager {
|
|||||||
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
|
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
|
||||||
"failpoint"
|
"failpoint"
|
||||||
)));
|
)));
|
||||||
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
|
if let Err(e) = timeline
|
||||||
|
.wait_lsn(
|
||||||
|
*target_lsn,
|
||||||
|
crate::tenant::timeline::WaitLsnWaiter::Tenant,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
// Failure here might mean shutdown, in any case this part is an optimization
|
// Failure here might mean shutdown, in any case this part is an optimization
|
||||||
// and we shouldn't hold up the split operation.
|
// and we shouldn't hold up the split operation.
|
||||||
tracing::warn!(
|
tracing::warn!(
|
||||||
@@ -1670,7 +1676,7 @@ impl TenantManager {
|
|||||||
|
|
||||||
// Phase 5: Shut down the parent shard, and erase it from disk
|
// Phase 5: Shut down the parent shard, and erase it from disk
|
||||||
let (_guard, progress) = completion::channel();
|
let (_guard, progress) = completion::channel();
|
||||||
match parent.shutdown(progress, false).await {
|
match parent.shutdown(progress, ShutdownMode::Hard).await {
|
||||||
Ok(()) => {}
|
Ok(()) => {}
|
||||||
Err(other) => {
|
Err(other) => {
|
||||||
other.wait().await;
|
other.wait().await;
|
||||||
@@ -2657,11 +2663,11 @@ where
|
|||||||
let attached_tenant = match slot_guard.get_old_value() {
|
let attached_tenant = match slot_guard.get_old_value() {
|
||||||
Some(TenantSlot::Attached(tenant)) => {
|
Some(TenantSlot::Attached(tenant)) => {
|
||||||
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
|
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
|
||||||
let freeze_and_flush = false;
|
let shutdown_mode = ShutdownMode::Hard;
|
||||||
|
|
||||||
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
|
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
|
||||||
// that we can continue safely to cleanup.
|
// that we can continue safely to cleanup.
|
||||||
match tenant.shutdown(progress, freeze_and_flush).await {
|
match tenant.shutdown(progress, shutdown_mode).await {
|
||||||
Ok(()) => {}
|
Ok(()) => {}
|
||||||
Err(_other) => {
|
Err(_other) => {
|
||||||
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
|
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
|
||||||
|
|||||||
@@ -200,6 +200,7 @@ use utils::backoff::{
|
|||||||
use std::collections::{HashMap, VecDeque};
|
use std::collections::{HashMap, VecDeque};
|
||||||
use std::sync::atomic::{AtomicU32, Ordering};
|
use std::sync::atomic::{AtomicU32, Ordering};
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||||
use std::ops::DerefMut;
|
use std::ops::DerefMut;
|
||||||
@@ -207,7 +208,7 @@ use tracing::{debug, error, info, instrument, warn};
|
|||||||
use tracing::{info_span, Instrument};
|
use tracing::{info_span, Instrument};
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
use crate::deletion_queue::DeletionQueueClient;
|
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
|
||||||
use crate::metrics::{
|
use crate::metrics::{
|
||||||
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
|
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
|
||||||
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||||
@@ -261,6 +262,10 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
|
|||||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||||
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||||
|
|
||||||
|
/// Doing non-essential flushes of deletion queue is subject to this timeout, after
|
||||||
|
/// which we warn and skip.
|
||||||
|
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
pub enum MaybeDeletedIndexPart {
|
pub enum MaybeDeletedIndexPart {
|
||||||
IndexPart(IndexPart),
|
IndexPart(IndexPart),
|
||||||
Deleted(IndexPart),
|
Deleted(IndexPart),
|
||||||
@@ -588,14 +593,14 @@ impl RemoteTimelineClient {
|
|||||||
upload_queue: &mut UploadQueueInitialized,
|
upload_queue: &mut UploadQueueInitialized,
|
||||||
metadata: TimelineMetadata,
|
metadata: TimelineMetadata,
|
||||||
) {
|
) {
|
||||||
|
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"scheduling metadata upload with {} files ({} changed)",
|
"scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
|
||||||
upload_queue.latest_files.len(),
|
upload_queue.latest_files.len(),
|
||||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||||
);
|
);
|
||||||
|
|
||||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
|
||||||
|
|
||||||
let index_part = IndexPart::new(
|
let index_part = IndexPart::new(
|
||||||
upload_queue.latest_files.clone(),
|
upload_queue.latest_files.clone(),
|
||||||
disk_consistent_lsn,
|
disk_consistent_lsn,
|
||||||
@@ -1050,6 +1055,26 @@ impl RemoteTimelineClient {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
|
||||||
|
match tokio::time::timeout(
|
||||||
|
DELETION_QUEUE_FLUSH_TIMEOUT,
|
||||||
|
self.deletion_queue_client.flush_immediate(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(result) => result,
|
||||||
|
Err(_timeout) => {
|
||||||
|
// Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and
|
||||||
|
// to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion
|
||||||
|
// queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here.
|
||||||
|
tracing::warn!(
|
||||||
|
"Timed out waiting for deletion queue flush, acking deletion anyway"
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
|
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
|
||||||
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something
|
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something
|
||||||
/// deletes leaked files if any and proceeds with deletion of index file at the end.
|
/// deletes leaked files if any and proceeds with deletion of index file at the end.
|
||||||
@@ -1099,7 +1124,7 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
|
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
|
||||||
// taking the burden of listing all the layers that we already know we should delete.
|
// taking the burden of listing all the layers that we already know we should delete.
|
||||||
self.deletion_queue_client.flush_immediate().await?;
|
self.flush_deletion_queue().await?;
|
||||||
|
|
||||||
let cancel = shutdown_token();
|
let cancel = shutdown_token();
|
||||||
|
|
||||||
@@ -1173,7 +1198,7 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
// Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
|
// Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
|
||||||
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
|
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
|
||||||
self.deletion_queue_client.flush_immediate().await?;
|
self.flush_deletion_queue().await?;
|
||||||
|
|
||||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||||
Err(anyhow::anyhow!(
|
Err(anyhow::anyhow!(
|
||||||
@@ -1569,7 +1594,7 @@ impl RemoteTimelineClient {
|
|||||||
/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
|
/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
|
||||||
///
|
///
|
||||||
/// In-progress operations will still be running after this function returns.
|
/// In-progress operations will still be running after this function returns.
|
||||||
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
/// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
|
||||||
/// to wait for them to complete, after calling this function.
|
/// to wait for them to complete, after calling this function.
|
||||||
pub(crate) fn stop(&self) {
|
pub(crate) fn stop(&self) {
|
||||||
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
||||||
|
|||||||
@@ -786,6 +786,35 @@ impl<'a> TenantDownloader<'a> {
|
|||||||
// Existing on-disk layers: just update their access time.
|
// Existing on-disk layers: just update their access time.
|
||||||
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
|
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
|
||||||
tracing::debug!("Layer {} is already on disk", layer.name);
|
tracing::debug!("Layer {} is already on disk", layer.name);
|
||||||
|
|
||||||
|
if cfg!(debug_assertions) {
|
||||||
|
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
|
||||||
|
// are already present on disk are really there.
|
||||||
|
let local_path = self
|
||||||
|
.conf
|
||||||
|
.timeline_path(tenant_shard_id, &timeline.timeline_id)
|
||||||
|
.join(layer.name.file_name());
|
||||||
|
match tokio::fs::metadata(&local_path).await {
|
||||||
|
Ok(meta) => {
|
||||||
|
tracing::debug!(
|
||||||
|
"Layer {} present at {}, size {}",
|
||||||
|
layer.name,
|
||||||
|
local_path,
|
||||||
|
meta.len(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
"Layer {} not found at {} ({})",
|
||||||
|
layer.name,
|
||||||
|
local_path,
|
||||||
|
e
|
||||||
|
);
|
||||||
|
debug_assert!(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|
||||||
|| on_disk.access_time != layer.access_time
|
|| on_disk.access_time != layer.access_time
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
|
|||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
use std::collections::{BinaryHeap, HashMap};
|
use std::collections::{BinaryHeap, HashMap};
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::sync::Mutex;
|
use std::sync::{Arc, Mutex};
|
||||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||||
use tracing::warn;
|
use tracing::warn;
|
||||||
use utils::history_buffer::HistoryBufferWithDropCounter;
|
use utils::history_buffer::HistoryBufferWithDropCounter;
|
||||||
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
|||||||
|
|
||||||
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||||
|
|
||||||
use super::layer_map::InMemoryLayerHandle;
|
use self::inmemory_layer::InMemoryLayerFileId;
|
||||||
use super::timeline::layer_manager::LayerManager;
|
|
||||||
use super::timeline::GetVectoredError;
|
use super::timeline::GetVectoredError;
|
||||||
use super::PageReconstructError;
|
use super::PageReconstructError;
|
||||||
|
|
||||||
@@ -204,23 +204,30 @@ impl Default for ValuesReconstructState {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Description of layer to be read - the layer map can turn
|
/// A key that uniquely identifies a layer in a timeline
|
||||||
/// this description into the actual layer.
|
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
pub(crate) enum LayerId {
|
||||||
pub(crate) enum ReadableLayerDesc {
|
PersitentLayerId(PersistentLayerKey),
|
||||||
Persistent {
|
InMemoryLayerId(InMemoryLayerFileId),
|
||||||
desc: PersistentLayerDesc,
|
|
||||||
lsn_range: Range<Lsn>,
|
|
||||||
},
|
|
||||||
InMemory {
|
|
||||||
handle: InMemoryLayerHandle,
|
|
||||||
lsn_ceil: Lsn,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Wrapper for 'ReadableLayerDesc' sorted by Lsn
|
/// Layer wrapper for the read path. Note that it is valid
|
||||||
|
/// to use these layers even after external operations have
|
||||||
|
/// been performed on them (compaction, freeze, etc.).
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
struct ReadableLayerDescOrdered(ReadableLayerDesc);
|
pub(crate) enum ReadableLayer {
|
||||||
|
PersistentLayer(Layer),
|
||||||
|
InMemoryLayer(Arc<InMemoryLayer>),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A partial description of a read to be done.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
struct ReadDesc {
|
||||||
|
/// An id used to resolve the readable layer within the fringe
|
||||||
|
layer_id: LayerId,
|
||||||
|
/// Lsn range for the read, used for selecting the next read
|
||||||
|
lsn_range: Range<Lsn>,
|
||||||
|
}
|
||||||
|
|
||||||
/// Data structure which maintains a fringe of layers for the
|
/// Data structure which maintains a fringe of layers for the
|
||||||
/// read path. The fringe is the set of layers which intersects
|
/// read path. The fringe is the set of layers which intersects
|
||||||
@@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc);
|
|||||||
/// a two layer indexing scheme.
|
/// a two layer indexing scheme.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub(crate) struct LayerFringe {
|
pub(crate) struct LayerFringe {
|
||||||
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
|
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
|
||||||
layers: HashMap<ReadableLayerDesc, KeySpace>,
|
layers: HashMap<LayerId, LayerKeyspace>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
struct LayerKeyspace {
|
||||||
|
layer: ReadableLayer,
|
||||||
|
target_keyspace: KeySpace,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LayerFringe {
|
impl LayerFringe {
|
||||||
pub(crate) fn new() -> Self {
|
pub(crate) fn new() -> Self {
|
||||||
LayerFringe {
|
LayerFringe {
|
||||||
layers_by_lsn: BinaryHeap::new(),
|
planned_reads_by_lsn: BinaryHeap::new(),
|
||||||
layers: HashMap::new(),
|
layers: HashMap::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
|
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
|
||||||
let handle = match self.layers_by_lsn.pop() {
|
let read_desc = match self.planned_reads_by_lsn.pop() {
|
||||||
Some(h) => h,
|
Some(desc) => desc,
|
||||||
None => return None,
|
None => return None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let removed = self.layers.remove_entry(&handle.0);
|
let removed = self.layers.remove_entry(&read_desc.layer_id);
|
||||||
match removed {
|
match removed {
|
||||||
Some((layer, keyspace)) => Some((layer, keyspace)),
|
Some((
|
||||||
|
_,
|
||||||
|
LayerKeyspace {
|
||||||
|
layer,
|
||||||
|
target_keyspace,
|
||||||
|
},
|
||||||
|
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
|
||||||
None => unreachable!("fringe internals are always consistent"),
|
None => unreachable!("fringe internals are always consistent"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
|
pub(crate) fn update(
|
||||||
let entry = self.layers.entry(layer.clone());
|
&mut self,
|
||||||
|
layer: ReadableLayer,
|
||||||
|
keyspace: KeySpace,
|
||||||
|
lsn_range: Range<Lsn>,
|
||||||
|
) {
|
||||||
|
let layer_id = layer.id();
|
||||||
|
let entry = self.layers.entry(layer_id.clone());
|
||||||
match entry {
|
match entry {
|
||||||
Entry::Occupied(mut entry) => {
|
Entry::Occupied(mut entry) => {
|
||||||
entry.get_mut().merge(&keyspace);
|
entry.get_mut().target_keyspace.merge(&keyspace);
|
||||||
}
|
}
|
||||||
Entry::Vacant(entry) => {
|
Entry::Vacant(entry) => {
|
||||||
self.layers_by_lsn
|
self.planned_reads_by_lsn.push(ReadDesc {
|
||||||
.push(ReadableLayerDescOrdered(entry.key().clone()));
|
lsn_range,
|
||||||
entry.insert(keyspace);
|
layer_id: layer_id.clone(),
|
||||||
|
});
|
||||||
|
entry.insert(LayerKeyspace {
|
||||||
|
layer,
|
||||||
|
target_keyspace: keyspace,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -277,77 +307,55 @@ impl Default for LayerFringe {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Ord for ReadableLayerDescOrdered {
|
impl Ord for ReadDesc {
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
|
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
|
||||||
if ord == std::cmp::Ordering::Equal {
|
if ord == std::cmp::Ordering::Equal {
|
||||||
self.0
|
self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
|
||||||
.get_lsn_floor()
|
|
||||||
.cmp(&other.0.get_lsn_floor())
|
|
||||||
.reverse()
|
|
||||||
} else {
|
} else {
|
||||||
ord
|
ord
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PartialOrd for ReadableLayerDescOrdered {
|
impl PartialOrd for ReadDesc {
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
Some(self.cmp(other))
|
Some(self.cmp(other))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PartialEq for ReadableLayerDescOrdered {
|
impl PartialEq for ReadDesc {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
self.0.get_lsn_floor() == other.0.get_lsn_floor()
|
self.lsn_range == other.lsn_range
|
||||||
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Eq for ReadableLayerDescOrdered {}
|
impl Eq for ReadDesc {}
|
||||||
|
|
||||||
impl ReadableLayerDesc {
|
impl ReadableLayer {
|
||||||
pub(crate) fn get_lsn_floor(&self) -> Lsn {
|
pub(crate) fn id(&self) -> LayerId {
|
||||||
match self {
|
match self {
|
||||||
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
|
Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
|
||||||
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
|
Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
|
|
||||||
match self {
|
|
||||||
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
|
|
||||||
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn get_values_reconstruct_data(
|
pub(crate) async fn get_values_reconstruct_data(
|
||||||
&self,
|
&self,
|
||||||
layer_manager: &LayerManager,
|
|
||||||
keyspace: KeySpace,
|
keyspace: KeySpace,
|
||||||
|
lsn_range: Range<Lsn>,
|
||||||
reconstruct_state: &mut ValuesReconstructState,
|
reconstruct_state: &mut ValuesReconstructState,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<(), GetVectoredError> {
|
) -> Result<(), GetVectoredError> {
|
||||||
match self {
|
match self {
|
||||||
ReadableLayerDesc::Persistent { desc, lsn_range } => {
|
ReadableLayer::PersistentLayer(layer) => {
|
||||||
let layer = layer_manager.get_from_desc(desc);
|
|
||||||
layer
|
layer
|
||||||
.get_values_reconstruct_data(
|
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
|
||||||
keyspace,
|
|
||||||
lsn_range.clone(),
|
|
||||||
reconstruct_state,
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
|
ReadableLayer::InMemoryLayer(layer) => {
|
||||||
let layer = layer_manager
|
|
||||||
.layer_map()
|
|
||||||
.get_in_memory_layer(handle)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
layer
|
layer
|
||||||
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
|
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
|||||||
use bytes::BytesMut;
|
use bytes::BytesMut;
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
|
use itertools::Itertools;
|
||||||
use pageserver_api::keyspace::KeySpace;
|
use pageserver_api::keyspace::KeySpace;
|
||||||
use pageserver_api::models::LayerAccessKind;
|
use pageserver_api::models::LayerAccessKind;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
@@ -946,6 +947,34 @@ impl DeltaLayerInner {
|
|||||||
Ok(planner.finish())
|
Ok(planner.finish())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_min_read_buffer_size(
|
||||||
|
planned_reads: &[VectoredRead],
|
||||||
|
read_size_soft_max: usize,
|
||||||
|
) -> usize {
|
||||||
|
let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
|
||||||
|
return read_size_soft_max;
|
||||||
|
};
|
||||||
|
|
||||||
|
let largest_read_size = largest_read.size();
|
||||||
|
if largest_read_size > read_size_soft_max {
|
||||||
|
// If the read is oversized, it should only contain one key.
|
||||||
|
let offenders = largest_read
|
||||||
|
.blobs_at
|
||||||
|
.as_slice()
|
||||||
|
.iter()
|
||||||
|
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
|
||||||
|
.join(", ");
|
||||||
|
tracing::warn!(
|
||||||
|
"Oversized vectored read ({} > {}) for keys {}",
|
||||||
|
largest_read_size,
|
||||||
|
read_size_soft_max,
|
||||||
|
offenders
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
largest_read_size
|
||||||
|
}
|
||||||
|
|
||||||
async fn do_reads_and_update_state(
|
async fn do_reads_and_update_state(
|
||||||
&self,
|
&self,
|
||||||
reads: Vec<VectoredRead>,
|
reads: Vec<VectoredRead>,
|
||||||
@@ -959,7 +988,8 @@ impl DeltaLayerInner {
|
|||||||
.expect("Layer is loaded with max vectored bytes config")
|
.expect("Layer is loaded with max vectored bytes config")
|
||||||
.0
|
.0
|
||||||
.into();
|
.into();
|
||||||
let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
|
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
|
||||||
|
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||||
|
|
||||||
// Note that reads are processed in reverse order (from highest key+lsn).
|
// Note that reads are processed in reverse order (from highest key+lsn).
|
||||||
// This is the order that `ReconstructState` requires such that it can
|
// This is the order that `ReconstructState` requires such that it can
|
||||||
@@ -986,7 +1016,7 @@ impl DeltaLayerInner {
|
|||||||
|
|
||||||
// We have "lost" the buffer since the lower level IO api
|
// We have "lost" the buffer since the lower level IO api
|
||||||
// doesn't return the buffer on error. Allocate a new one.
|
// doesn't return the buffer on error. Allocate a new one.
|
||||||
buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
|
buf = Some(BytesMut::with_capacity(buf_size));
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1210,9 +1240,16 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
|
|||||||
mod test {
|
mod test {
|
||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
use itertools::MinMaxResult;
|
||||||
|
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||||
|
use rand::RngCore;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::{
|
use crate::{
|
||||||
context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
|
context::DownloadBehavior,
|
||||||
|
task_mgr::TaskKind,
|
||||||
|
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
|
||||||
|
DEFAULT_PG_VERSION,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Construct an index for a fictional delta layer and then
|
/// Construct an index for a fictional delta layer and then
|
||||||
@@ -1332,4 +1369,229 @@ mod test {
|
|||||||
|
|
||||||
assert_eq!(planned_blobs, expected_blobs);
|
assert_eq!(planned_blobs, expected_blobs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mod constants {
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
|
/// Offset used by all lsns in this test
|
||||||
|
pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
|
||||||
|
/// Number of unique keys included in the test data
|
||||||
|
pub(super) const KEY_COUNT: u8 = 60;
|
||||||
|
/// Max number of different lsns for each key
|
||||||
|
pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
|
||||||
|
/// Possible value sizes for each key along with a probability weight
|
||||||
|
pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
|
||||||
|
/// Probability that there will be a gap between the current key and the next one (33.3%)
|
||||||
|
pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
|
||||||
|
/// The minimum size of a key range in all the generated reads
|
||||||
|
pub(super) const MIN_RANGE_SIZE: i128 = 10;
|
||||||
|
/// The number of ranges included in each vectored read
|
||||||
|
pub(super) const RANGES_COUNT: u8 = 2;
|
||||||
|
/// The number of vectored reads performed
|
||||||
|
pub(super) const READS_COUNT: u8 = 100;
|
||||||
|
/// Soft max size of a vectored read. Will be violated if we have to read keys
|
||||||
|
/// with values larger than the limit
|
||||||
|
pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Entry {
|
||||||
|
key: Key,
|
||||||
|
lsn: Lsn,
|
||||||
|
value: Vec<u8>,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
|
||||||
|
let mut current_key = Key::MIN;
|
||||||
|
|
||||||
|
let mut entries = Vec::new();
|
||||||
|
for _ in 0..constants::KEY_COUNT {
|
||||||
|
let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
|
||||||
|
let mut lsns_iter =
|
||||||
|
std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
|
||||||
|
Some(Lsn(lsn.0 + 0x08))
|
||||||
|
});
|
||||||
|
let mut lsns = Vec::new();
|
||||||
|
while lsns.len() < count as usize {
|
||||||
|
let take = rng.gen_bool(0.5);
|
||||||
|
let lsn = lsns_iter.next().unwrap();
|
||||||
|
if take {
|
||||||
|
lsns.push(lsn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for lsn in lsns {
|
||||||
|
let size = constants::VALUE_SIZES
|
||||||
|
.choose_weighted(rng, |item| item.1)
|
||||||
|
.unwrap()
|
||||||
|
.0;
|
||||||
|
let mut buf = vec![0; size];
|
||||||
|
rng.fill_bytes(&mut buf);
|
||||||
|
|
||||||
|
entries.push(Entry {
|
||||||
|
key: current_key,
|
||||||
|
lsn,
|
||||||
|
value: buf,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
let gap = constants::KEY_GAP_CHANGES
|
||||||
|
.choose_weighted(rng, |item| item.1)
|
||||||
|
.unwrap()
|
||||||
|
.0;
|
||||||
|
if gap {
|
||||||
|
current_key = current_key.add(2);
|
||||||
|
} else {
|
||||||
|
current_key = current_key.add(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
entries
|
||||||
|
}
|
||||||
|
|
||||||
|
struct EntriesMeta {
|
||||||
|
key_range: Range<Key>,
|
||||||
|
lsn_range: Range<Lsn>,
|
||||||
|
index: BTreeMap<(Key, Lsn), Vec<u8>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
|
||||||
|
let key_range = match entries.iter().minmax_by_key(|e| e.key) {
|
||||||
|
MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
|
||||||
|
_ => panic!("More than one entry is always expected"),
|
||||||
|
};
|
||||||
|
|
||||||
|
let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
|
||||||
|
MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
|
||||||
|
_ => panic!("More than one entry is always expected"),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut index = BTreeMap::new();
|
||||||
|
for entry in entries.iter() {
|
||||||
|
index.insert((entry.key, entry.lsn), entry.value.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
EntriesMeta {
|
||||||
|
key_range,
|
||||||
|
lsn_range,
|
||||||
|
index,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
|
||||||
|
let start = key_range.start.to_i128();
|
||||||
|
let end = key_range.end.to_i128();
|
||||||
|
|
||||||
|
let mut keyspace = KeySpace::default();
|
||||||
|
|
||||||
|
for _ in 0..constants::RANGES_COUNT {
|
||||||
|
let mut range: Option<Range<Key>> = Option::default();
|
||||||
|
while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
|
||||||
|
let range_start = rng.gen_range(start..end);
|
||||||
|
let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
|
||||||
|
if range_end_offset >= end {
|
||||||
|
range = Some(Key::from_i128(range_start)..Key::from_i128(end));
|
||||||
|
} else {
|
||||||
|
let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
|
||||||
|
range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
keyspace.ranges.push(range.unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
keyspace
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
|
||||||
|
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
|
||||||
|
let timeline_id = TimelineId::generate();
|
||||||
|
let timeline = tenant
|
||||||
|
.create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
tracing::info!("Generating test data ...");
|
||||||
|
|
||||||
|
let rng = &mut StdRng::seed_from_u64(0);
|
||||||
|
let entries = generate_entries(rng);
|
||||||
|
let entries_meta = get_entries_meta(&entries);
|
||||||
|
|
||||||
|
tracing::info!("Done generating {} entries", entries.len());
|
||||||
|
|
||||||
|
tracing::info!("Writing test data to delta layer ...");
|
||||||
|
let mut writer = DeltaLayerWriter::new(
|
||||||
|
harness.conf,
|
||||||
|
timeline_id,
|
||||||
|
harness.tenant_shard_id,
|
||||||
|
entries_meta.key_range.start,
|
||||||
|
entries_meta.lsn_range.clone(),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
for entry in entries {
|
||||||
|
let (_, res) = writer
|
||||||
|
.put_value_bytes(entry.key, entry.lsn, entry.value, false)
|
||||||
|
.await;
|
||||||
|
res?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
|
||||||
|
|
||||||
|
let inner = resident.get_inner_delta(&ctx).await?;
|
||||||
|
|
||||||
|
let file_size = inner.file.metadata().await?.len();
|
||||||
|
tracing::info!(
|
||||||
|
"Done writing test data to delta layer. Resulting file size is: {}",
|
||||||
|
file_size
|
||||||
|
);
|
||||||
|
|
||||||
|
for i in 0..constants::READS_COUNT {
|
||||||
|
tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
|
||||||
|
|
||||||
|
let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
|
||||||
|
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||||
|
inner.index_start_blk,
|
||||||
|
inner.index_root_blk,
|
||||||
|
block_reader,
|
||||||
|
);
|
||||||
|
|
||||||
|
let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
|
||||||
|
let mut reconstruct_state = ValuesReconstructState::new();
|
||||||
|
let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
|
||||||
|
let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
|
||||||
|
|
||||||
|
let vectored_reads = DeltaLayerInner::plan_reads(
|
||||||
|
keyspace.clone(),
|
||||||
|
entries_meta.lsn_range.clone(),
|
||||||
|
data_end_offset,
|
||||||
|
index_reader,
|
||||||
|
planner,
|
||||||
|
&mut reconstruct_state,
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
|
||||||
|
let buf_size = DeltaLayerInner::get_min_read_buffer_size(
|
||||||
|
&vectored_reads,
|
||||||
|
constants::MAX_VECTORED_READ_BYTES,
|
||||||
|
);
|
||||||
|
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||||
|
|
||||||
|
for read in vectored_reads {
|
||||||
|
let blobs_buf = vectored_blob_reader
|
||||||
|
.read_blobs(&read, buf.take().expect("Should have a buffer"))
|
||||||
|
.await?;
|
||||||
|
for meta in blobs_buf.blobs.iter() {
|
||||||
|
let value = &blobs_buf.buf[meta.start..meta.end];
|
||||||
|
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
|
||||||
|
}
|
||||||
|
|
||||||
|
buf = Some(blobs_buf.buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
|||||||
use bytes::{Bytes, BytesMut};
|
use bytes::{Bytes, BytesMut};
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use hex;
|
use hex;
|
||||||
|
use itertools::Itertools;
|
||||||
use pageserver_api::keyspace::KeySpace;
|
use pageserver_api::keyspace::KeySpace;
|
||||||
use pageserver_api::models::LayerAccessKind;
|
use pageserver_api::models::LayerAccessKind;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
@@ -540,7 +541,25 @@ impl ImageLayerInner {
|
|||||||
|
|
||||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||||
for read in reads.into_iter() {
|
for read in reads.into_iter() {
|
||||||
let buf = BytesMut::with_capacity(max_vectored_read_bytes);
|
let buf_size = read.size();
|
||||||
|
|
||||||
|
if buf_size > max_vectored_read_bytes {
|
||||||
|
// If the read is oversized, it should only contain one key.
|
||||||
|
let offenders = read
|
||||||
|
.blobs_at
|
||||||
|
.as_slice()
|
||||||
|
.iter()
|
||||||
|
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
|
||||||
|
.join(", ");
|
||||||
|
tracing::warn!(
|
||||||
|
"Oversized vectored read ({} > {}) for keys {}",
|
||||||
|
buf_size,
|
||||||
|
max_vectored_read_bytes,
|
||||||
|
offenders
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let buf = BytesMut::with_capacity(buf_size);
|
||||||
let res = vectored_blob_reader.read_blobs(&read, buf).await;
|
let res = vectored_blob_reader.read_blobs(&read, buf).await;
|
||||||
|
|
||||||
match res {
|
match res {
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
|
|||||||
use crate::tenant::storage_layer::ValueReconstructResult;
|
use crate::tenant::storage_layer::ValueReconstructResult;
|
||||||
use crate::tenant::timeline::GetVectoredError;
|
use crate::tenant::timeline::GetVectoredError;
|
||||||
use crate::tenant::{PageReconstructError, Timeline};
|
use crate::tenant::{PageReconstructError, Timeline};
|
||||||
use crate::walrecord;
|
use crate::{page_cache, walrecord};
|
||||||
use anyhow::{anyhow, ensure, Result};
|
use anyhow::{anyhow, ensure, Result};
|
||||||
use pageserver_api::keyspace::KeySpace;
|
use pageserver_api::keyspace::KeySpace;
|
||||||
use pageserver_api::models::InMemoryLayerInfo;
|
use pageserver_api::models::InMemoryLayerInfo;
|
||||||
@@ -36,10 +36,14 @@ use super::{
|
|||||||
ValuesReconstructState,
|
ValuesReconstructState,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
|
||||||
|
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
|
||||||
|
|
||||||
pub struct InMemoryLayer {
|
pub struct InMemoryLayer {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
|
file_id: InMemoryLayerFileId,
|
||||||
|
|
||||||
/// This layer contains all the changes from 'start_lsn'. The
|
/// This layer contains all the changes from 'start_lsn'. The
|
||||||
/// start is inclusive.
|
/// start is inclusive.
|
||||||
@@ -200,6 +204,10 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
|
|||||||
};
|
};
|
||||||
|
|
||||||
impl InMemoryLayer {
|
impl InMemoryLayer {
|
||||||
|
pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
|
||||||
|
self.file_id
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn get_timeline_id(&self) -> TimelineId {
|
pub(crate) fn get_timeline_id(&self) -> TimelineId {
|
||||||
self.timeline_id
|
self.timeline_id
|
||||||
}
|
}
|
||||||
@@ -443,8 +451,10 @@ impl InMemoryLayer {
|
|||||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||||
|
|
||||||
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
|
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
|
||||||
|
let key = InMemoryLayerFileId(file.id());
|
||||||
|
|
||||||
Ok(InMemoryLayer {
|
Ok(InMemoryLayer {
|
||||||
|
file_id: key,
|
||||||
conf,
|
conf,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_shard_id,
|
tenant_shard_id,
|
||||||
|
|||||||
@@ -1759,6 +1759,18 @@ impl ResidentLayer {
|
|||||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||||
self.owner.metadata()
|
self.owner.metadata()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
pub(crate) async fn get_inner_delta<'a>(
|
||||||
|
&'a self,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
|
||||||
|
let owner = &self.owner.0;
|
||||||
|
match self.downloaded.get(owner, ctx).await? {
|
||||||
|
LayerKind::Delta(d) => Ok(d),
|
||||||
|
LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AsLayerDesc for ResidentLayer {
|
impl AsLayerDesc for ResidentLayer {
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ pub mod uninit;
|
|||||||
mod walreceiver;
|
mod walreceiver;
|
||||||
|
|
||||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||||
|
use arc_swap::ArcSwap;
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use enumset::EnumSet;
|
use enumset::EnumSet;
|
||||||
@@ -118,11 +119,11 @@ use self::layer_manager::LayerManager;
|
|||||||
use self::logical_size::LogicalSize;
|
use self::logical_size::LogicalSize;
|
||||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||||
|
|
||||||
use super::remote_timeline_client::RemoteTimelineClient;
|
use super::config::TenantConf;
|
||||||
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
|
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
|
||||||
use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
|
|
||||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||||
|
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||||
pub(super) enum FlushLoopState {
|
pub(super) enum FlushLoopState {
|
||||||
@@ -183,7 +184,7 @@ pub(crate) struct AuxFilesState {
|
|||||||
|
|
||||||
pub struct Timeline {
|
pub struct Timeline {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
|
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||||
|
|
||||||
myself: Weak<Self>,
|
myself: Weak<Self>,
|
||||||
|
|
||||||
@@ -281,10 +282,12 @@ pub struct Timeline {
|
|||||||
pub(super) flush_loop_state: Mutex<FlushLoopState>,
|
pub(super) flush_loop_state: Mutex<FlushLoopState>,
|
||||||
|
|
||||||
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
|
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
|
||||||
/// The value is a counter, incremented every time a new flush cycle is requested.
|
/// - The u64 value is a counter, incremented every time a new flush cycle is requested.
|
||||||
/// The flush cycle counter is sent back on the layer_flush_done channel when
|
/// The flush cycle counter is sent back on the layer_flush_done channel when
|
||||||
/// the flush finishes. You can use that to wait for the flush to finish.
|
/// the flush finishes. You can use that to wait for the flush to finish.
|
||||||
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
|
/// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn
|
||||||
|
/// read by whoever sends an update
|
||||||
|
layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>,
|
||||||
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
|
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
|
||||||
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
|
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
|
||||||
|
|
||||||
@@ -309,6 +312,8 @@ pub struct Timeline {
|
|||||||
/// Configuration: how often should the partitioning be recalculated.
|
/// Configuration: how often should the partitioning be recalculated.
|
||||||
repartition_threshold: u64,
|
repartition_threshold: u64,
|
||||||
|
|
||||||
|
last_image_layer_creation_check_at: AtomicLsn,
|
||||||
|
|
||||||
/// Current logical size of the "datadir", at the last LSN.
|
/// Current logical size of the "datadir", at the last LSN.
|
||||||
current_logical_size: LogicalSize,
|
current_logical_size: LogicalSize,
|
||||||
|
|
||||||
@@ -610,6 +615,25 @@ pub enum GetVectoredImpl {
|
|||||||
Vectored,
|
Vectored,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) enum WaitLsnWaiter<'a> {
|
||||||
|
Timeline(&'a Timeline),
|
||||||
|
Tenant,
|
||||||
|
PageService,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Argument to [`Timeline::shutdown`].
|
||||||
|
#[derive(Debug, Clone, Copy)]
|
||||||
|
pub(crate) enum ShutdownMode {
|
||||||
|
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
||||||
|
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
|
||||||
|
///
|
||||||
|
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
|
||||||
|
/// the call to [`Timeline::shutdown`].
|
||||||
|
FreezeAndFlush,
|
||||||
|
/// Shut down immediately, without waiting for any open layers to flush.
|
||||||
|
Hard,
|
||||||
|
}
|
||||||
|
|
||||||
/// Public interface functions
|
/// Public interface functions
|
||||||
impl Timeline {
|
impl Timeline {
|
||||||
/// Get the LSN where this branch was created
|
/// Get the LSN where this branch was created
|
||||||
@@ -1058,7 +1082,8 @@ impl Timeline {
|
|||||||
pub(crate) async fn wait_lsn(
|
pub(crate) async fn wait_lsn(
|
||||||
&self,
|
&self,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
_ctx: &RequestContext, /* Prepare for use by cancellation */
|
who_is_waiting: WaitLsnWaiter<'_>,
|
||||||
|
ctx: &RequestContext, /* Prepare for use by cancellation */
|
||||||
) -> Result<(), WaitLsnError> {
|
) -> Result<(), WaitLsnError> {
|
||||||
if self.cancel.is_cancelled() {
|
if self.cancel.is_cancelled() {
|
||||||
return Err(WaitLsnError::Shutdown);
|
return Err(WaitLsnError::Shutdown);
|
||||||
@@ -1066,20 +1091,28 @@ impl Timeline {
|
|||||||
return Err(WaitLsnError::BadState);
|
return Err(WaitLsnError::BadState);
|
||||||
}
|
}
|
||||||
|
|
||||||
// This should never be called from the WAL receiver, because that could lead
|
if cfg!(debug_assertions) {
|
||||||
// to a deadlock.
|
match ctx.task_kind() {
|
||||||
debug_assert!(
|
TaskKind::WalReceiverManager
|
||||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
|
| TaskKind::WalReceiverConnectionHandler
|
||||||
"wait_lsn cannot be called in WAL receiver"
|
| TaskKind::WalReceiverConnectionPoller => {
|
||||||
);
|
let is_myself = match who_is_waiting {
|
||||||
debug_assert!(
|
WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
|
||||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
|
WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
|
||||||
"wait_lsn cannot be called in WAL receiver"
|
};
|
||||||
);
|
if is_myself {
|
||||||
debug_assert!(
|
if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
|
||||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
|
// walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
|
||||||
"wait_lsn cannot be called in WAL receiver"
|
panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
|
||||||
);
|
}
|
||||||
|
} else {
|
||||||
|
// if another timeline is waiting for us, there's no deadlock risk because
|
||||||
|
// our walreceiver task can make progress independent of theirs
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
|
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
|
||||||
|
|
||||||
@@ -1138,8 +1171,8 @@ impl Timeline {
|
|||||||
/// Flush to disk all data that was written with the put_* functions
|
/// Flush to disk all data that was written with the put_* functions
|
||||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
|
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
|
||||||
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
|
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
|
||||||
self.freeze_inmem_layer(false).await;
|
let to_lsn = self.freeze_inmem_layer(false).await;
|
||||||
self.flush_frozen_layers_and_wait().await
|
self.flush_frozen_layers_and_wait(to_lsn).await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
|
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
|
||||||
@@ -1159,7 +1192,39 @@ impl Timeline {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
|
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
|
||||||
// No open layer, no work to do.
|
// If there is no open layer, we have no layer freezing to do. However, we might need to generate
|
||||||
|
// some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions
|
||||||
|
// that didn't result in writes to this shard.
|
||||||
|
|
||||||
|
// Must not hold the layers lock while waiting for a flush.
|
||||||
|
drop(layers_guard);
|
||||||
|
|
||||||
|
let last_record_lsn = self.get_last_record_lsn();
|
||||||
|
let disk_consistent_lsn = self.get_disk_consistent_lsn();
|
||||||
|
if last_record_lsn > disk_consistent_lsn {
|
||||||
|
// We have no open layer, but disk_consistent_lsn is behind the last record: this indicates
|
||||||
|
// we are a sharded tenant and have skipped some WAL
|
||||||
|
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
|
||||||
|
if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
|
||||||
|
// This should be somewhat rare, so we log it at INFO level.
|
||||||
|
//
|
||||||
|
// We checked for checkpoint timeout so that a shard without any
|
||||||
|
// data ingested (yet) doesn't write a remote index as soon as it
|
||||||
|
// sees its LSN advance: we only do this if we've been layer-less
|
||||||
|
// for some time.
|
||||||
|
tracing::info!(
|
||||||
|
"Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
|
||||||
|
disk_consistent_lsn,
|
||||||
|
last_record_lsn
|
||||||
|
);
|
||||||
|
|
||||||
|
// The flush loop will update remote consistent LSN as well as disk consistent LSN.
|
||||||
|
self.flush_frozen_layers_and_wait(last_record_lsn)
|
||||||
|
.await
|
||||||
|
.ok();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -1288,83 +1353,119 @@ impl Timeline {
|
|||||||
self.launch_eviction_task(parent, background_jobs_can_start);
|
self.launch_eviction_task(parent, background_jobs_can_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
/// After this function returns, there are no timeline-scoped tasks left running.
|
||||||
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
|
|
||||||
///
|
///
|
||||||
/// While we are flushing, we continue to accept read I/O.
|
/// The preferred pattern is:
|
||||||
pub(crate) async fn flush_and_shutdown(&self) {
|
/// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token
|
||||||
|
/// - if early shutdown (not just cancellation) of a sub-tree of tasks is required,
|
||||||
|
/// go the extra mile and keep track of JoinHandles
|
||||||
|
/// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar,
|
||||||
|
/// instead of spawning directly on a runtime. It is a more composable / testable pattern.
|
||||||
|
///
|
||||||
|
/// For legacy reasons, we still have multiple tasks spawned using
|
||||||
|
/// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`.
|
||||||
|
/// We refer to these as "timeline-scoped task_mgr tasks".
|
||||||
|
/// Some of these tasks are already sensitive to Timeline::cancel while others are
|
||||||
|
/// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`]
|
||||||
|
/// or [`task_mgr::shutdown_watcher`].
|
||||||
|
/// We want to gradually convert the code base away from these.
|
||||||
|
///
|
||||||
|
/// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
|
||||||
|
/// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
|
||||||
|
/// ones that aren't mentioned here):
|
||||||
|
/// - [`TaskKind::TimelineDeletionWorker`]
|
||||||
|
/// - NB: also used for tenant deletion
|
||||||
|
/// - [`TaskKind::RemoteUploadTask`]
|
||||||
|
/// - [`TaskKind::InitialLogicalSizeCalculation`]
|
||||||
|
/// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
|
||||||
|
// Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
|
||||||
|
/// - [`TaskKind::Eviction`]
|
||||||
|
/// - [`TaskKind::LayerFlushTask`]
|
||||||
|
/// - [`TaskKind::OndemandLogicalSizeCalculation`]
|
||||||
|
/// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
|
||||||
|
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
// Stop ingesting data, so that we are not still writing to an InMemoryLayer while
|
let try_freeze_and_flush = match mode {
|
||||||
// trying to flush
|
ShutdownMode::FreezeAndFlush => true,
|
||||||
tracing::debug!("Waiting for WalReceiverManager...");
|
ShutdownMode::Hard => false,
|
||||||
task_mgr::shutdown_tasks(
|
};
|
||||||
Some(TaskKind::WalReceiverManager),
|
|
||||||
Some(self.tenant_shard_id),
|
|
||||||
Some(self.timeline_id),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
// Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
|
// Regardless of whether we're going to try_freeze_and_flush
|
||||||
|
// or not, stop ingesting any more data. Walreceiver only provides
|
||||||
|
// cancellation but no "wait until gone", because it uses the Timeline::gate.
|
||||||
|
// So, only after the self.gate.close() below will we know for sure that
|
||||||
|
// no walreceiver tasks are left.
|
||||||
|
// For `try_freeze_and_flush=true`, this means that we might still be ingesting
|
||||||
|
// data during the call to `self.freeze_and_flush()` below.
|
||||||
|
// That's not ideal, but, we don't have the concept of a ChildGuard,
|
||||||
|
// which is what we'd need to properly model early shutdown of the walreceiver
|
||||||
|
// task sub-tree before the other Timeline task sub-trees.
|
||||||
|
let walreceiver = self.walreceiver.lock().unwrap().take();
|
||||||
|
tracing::debug!(
|
||||||
|
is_some = walreceiver.is_some(),
|
||||||
|
"Waiting for WalReceiverManager..."
|
||||||
|
);
|
||||||
|
if let Some(walreceiver) = walreceiver {
|
||||||
|
walreceiver.cancel();
|
||||||
|
}
|
||||||
|
// ... and inform any waiters for newer LSNs that there won't be any.
|
||||||
self.last_record_lsn.shutdown();
|
self.last_record_lsn.shutdown();
|
||||||
|
|
||||||
// now all writers to InMemory layer are gone, do the final flush if requested
|
if try_freeze_and_flush {
|
||||||
match self.freeze_and_flush().await {
|
// we shut down walreceiver above, so, we won't add anything more
|
||||||
Ok(_) => {
|
// to the InMemoryLayer; freeze it and wait for all frozen layers
|
||||||
// drain the upload queue
|
// to reach the disk & upload queue, then shut the upload queue and
|
||||||
if let Some(client) = self.remote_client.as_ref() {
|
// wait for it to drain.
|
||||||
// if we did not wait for completion here, it might be our shutdown process
|
match self.freeze_and_flush().await {
|
||||||
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
Ok(_) => {
|
||||||
// be spawned.
|
// drain the upload queue
|
||||||
//
|
if let Some(client) = self.remote_client.as_ref() {
|
||||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
// if we did not wait for completion here, it might be our shutdown process
|
||||||
// obviously it does not make sense to stop while we wait for it, but what
|
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
||||||
// about corner cases like s3 suddenly hanging up?
|
// be spawned.
|
||||||
client.shutdown().await;
|
//
|
||||||
|
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||||
|
// obviously it does not make sense to stop while we wait for it, but what
|
||||||
|
// about corner cases like s3 suddenly hanging up?
|
||||||
|
client.shutdown().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
||||||
|
// we have some extra WAL replay to do next time the timeline starts.
|
||||||
|
warn!("failed to freeze and flush: {e:#}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
|
||||||
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
|
||||||
// we have some extra WAL replay to do next time the timeline starts.
|
|
||||||
warn!("failed to freeze and flush: {e:#}");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self.shutdown().await;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
|
|
||||||
/// the graceful [`Timeline::flush_and_shutdown`] function.
|
|
||||||
pub(crate) async fn shutdown(&self) {
|
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
|
||||||
|
|
||||||
// Signal any subscribers to our cancellation token to drop out
|
// Signal any subscribers to our cancellation token to drop out
|
||||||
tracing::debug!("Cancelling CancellationToken");
|
tracing::debug!("Cancelling CancellationToken");
|
||||||
self.cancel.cancel();
|
self.cancel.cancel();
|
||||||
|
|
||||||
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
|
// Transition the remote_client into a state where it's only useful for timeline deletion.
|
||||||
// while doing so.
|
// (The deletion use case is why we can't just hook up remote_client to Self::cancel.)
|
||||||
self.last_record_lsn.shutdown();
|
|
||||||
|
|
||||||
// Shut down the layer flush task before the remote client, as one depends on the other
|
|
||||||
task_mgr::shutdown_tasks(
|
|
||||||
Some(TaskKind::LayerFlushTask),
|
|
||||||
Some(self.tenant_shard_id),
|
|
||||||
Some(self.timeline_id),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
|
|
||||||
// case our caller wants to use that for a deletion
|
|
||||||
if let Some(remote_client) = self.remote_client.as_ref() {
|
if let Some(remote_client) = self.remote_client.as_ref() {
|
||||||
remote_client.stop();
|
remote_client.stop();
|
||||||
|
// As documented in remote_client.stop()'s doc comment, it's our responsibility
|
||||||
|
// to shut down the upload queue tasks.
|
||||||
|
// TODO: fix that, task management should be encapsulated inside remote_client.
|
||||||
|
task_mgr::shutdown_tasks(
|
||||||
|
Some(TaskKind::RemoteUploadTask),
|
||||||
|
Some(self.tenant_shard_id),
|
||||||
|
Some(self.timeline_id),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: work toward making this a no-op. See this function's doc comment for more context.
|
||||||
tracing::debug!("Waiting for tasks...");
|
tracing::debug!("Waiting for tasks...");
|
||||||
|
|
||||||
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
|
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
|
||||||
|
|
||||||
// Finally wait until any gate-holders are complete
|
// Finally wait until any gate-holders are complete.
|
||||||
|
//
|
||||||
|
// TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
|
||||||
|
// and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
|
||||||
self.gate.close().await;
|
self.gate.close().await;
|
||||||
|
|
||||||
self.metrics.shutdown();
|
self.metrics.shutdown();
|
||||||
@@ -1568,57 +1669,65 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
|
|||||||
// Private functions
|
// Private functions
|
||||||
impl Timeline {
|
impl Timeline {
|
||||||
pub(crate) fn get_lazy_slru_download(&self) -> bool {
|
pub(crate) fn get_lazy_slru_download(&self) -> bool {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
|
.tenant_conf
|
||||||
.lazy_slru_download
|
.lazy_slru_download
|
||||||
.unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
|
.unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_checkpoint_distance(&self) -> u64 {
|
fn get_checkpoint_distance(&self) -> u64 {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
|
.tenant_conf
|
||||||
.checkpoint_distance
|
.checkpoint_distance
|
||||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
|
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_checkpoint_timeout(&self) -> Duration {
|
fn get_checkpoint_timeout(&self) -> Duration {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
|
.tenant_conf
|
||||||
.checkpoint_timeout
|
.checkpoint_timeout
|
||||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
|
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_compaction_target_size(&self) -> u64 {
|
fn get_compaction_target_size(&self) -> u64 {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
|
.tenant_conf
|
||||||
.compaction_target_size
|
.compaction_target_size
|
||||||
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
|
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_compaction_threshold(&self) -> usize {
|
fn get_compaction_threshold(&self) -> usize {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
|
.tenant_conf
|
||||||
.compaction_threshold
|
.compaction_threshold
|
||||||
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_image_creation_threshold(&self) -> usize {
|
fn get_image_creation_threshold(&self) -> usize {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
|
.tenant_conf
|
||||||
.image_creation_threshold
|
.image_creation_threshold
|
||||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
|
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
|
||||||
let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
|
let tenant_conf = &self.tenant_conf.load();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
|
.tenant_conf
|
||||||
.compaction_algorithm
|
.compaction_algorithm
|
||||||
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
|
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_eviction_policy(&self) -> EvictionPolicy {
|
fn get_eviction_policy(&self) -> EvictionPolicy {
|
||||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
let tenant_conf = self.tenant_conf.load();
|
||||||
tenant_conf
|
tenant_conf
|
||||||
|
.tenant_conf
|
||||||
.eviction_policy
|
.eviction_policy
|
||||||
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
|
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
|
||||||
}
|
}
|
||||||
@@ -1632,14 +1741,26 @@ impl Timeline {
|
|||||||
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
|
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(super) fn tenant_conf_updated(&self) {
|
fn get_image_layer_creation_check_threshold(&self) -> u8 {
|
||||||
|
let tenant_conf = self.tenant_conf.load();
|
||||||
|
tenant_conf
|
||||||
|
.tenant_conf
|
||||||
|
.image_layer_creation_check_threshold
|
||||||
|
.unwrap_or(
|
||||||
|
self.conf
|
||||||
|
.default_tenant_conf
|
||||||
|
.image_layer_creation_check_threshold,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
|
||||||
// NB: Most tenant conf options are read by background loops, so,
|
// NB: Most tenant conf options are read by background loops, so,
|
||||||
// changes will automatically be picked up.
|
// changes will automatically be picked up.
|
||||||
|
|
||||||
// The threshold is embedded in the metric. So, we need to update it.
|
// The threshold is embedded in the metric. So, we need to update it.
|
||||||
{
|
{
|
||||||
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
|
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
|
||||||
&self.tenant_conf.read().unwrap().tenant_conf,
|
new_conf,
|
||||||
&self.conf.default_tenant_conf,
|
&self.conf.default_tenant_conf,
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -1666,7 +1787,7 @@ impl Timeline {
|
|||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub(super) fn new(
|
pub(super) fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
|
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||||
metadata: &TimelineMetadata,
|
metadata: &TimelineMetadata,
|
||||||
ancestor: Option<Arc<Timeline>>,
|
ancestor: Option<Arc<Timeline>>,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
@@ -1682,17 +1803,16 @@ impl Timeline {
|
|||||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||||
let (state, _) = watch::channel(state);
|
let (state, _) = watch::channel(state);
|
||||||
|
|
||||||
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
|
let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
|
||||||
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
|
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
|
||||||
|
|
||||||
let tenant_conf_guard = tenant_conf.read().unwrap();
|
let evictions_low_residence_duration_metric_threshold = {
|
||||||
|
let loaded_tenant_conf = tenant_conf.load();
|
||||||
let evictions_low_residence_duration_metric_threshold =
|
|
||||||
Self::get_evictions_low_residence_duration_metric_threshold(
|
Self::get_evictions_low_residence_duration_metric_threshold(
|
||||||
&tenant_conf_guard.tenant_conf,
|
&loaded_tenant_conf.tenant_conf,
|
||||||
&conf.default_tenant_conf,
|
&conf.default_tenant_conf,
|
||||||
);
|
)
|
||||||
drop(tenant_conf_guard);
|
};
|
||||||
|
|
||||||
Arc::new_cyclic(|myself| {
|
Arc::new_cyclic(|myself| {
|
||||||
let mut result = Timeline {
|
let mut result = Timeline {
|
||||||
@@ -1769,6 +1889,7 @@ impl Timeline {
|
|||||||
},
|
},
|
||||||
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
|
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
|
||||||
repartition_threshold: 0,
|
repartition_threshold: 0,
|
||||||
|
last_image_layer_creation_check_at: AtomicLsn::new(0),
|
||||||
|
|
||||||
last_received_wal: Mutex::new(None),
|
last_received_wal: Mutex::new(None),
|
||||||
rel_size_cache: RwLock::new(HashMap::new()),
|
rel_size_cache: RwLock::new(HashMap::new()),
|
||||||
@@ -1797,6 +1918,7 @@ impl Timeline {
|
|||||||
};
|
};
|
||||||
result.repartition_threshold =
|
result.repartition_threshold =
|
||||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||||
|
|
||||||
result
|
result
|
||||||
.metrics
|
.metrics
|
||||||
.last_record_gauge
|
.last_record_gauge
|
||||||
@@ -1873,20 +1995,19 @@ impl Timeline {
|
|||||||
self.timeline_id, self.tenant_shard_id
|
self.timeline_id, self.tenant_shard_id
|
||||||
);
|
);
|
||||||
|
|
||||||
let tenant_conf_guard = self.tenant_conf.read().unwrap();
|
let tenant_conf = self.tenant_conf.load();
|
||||||
let wal_connect_timeout = tenant_conf_guard
|
let wal_connect_timeout = tenant_conf
|
||||||
.tenant_conf
|
.tenant_conf
|
||||||
.walreceiver_connect_timeout
|
.walreceiver_connect_timeout
|
||||||
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
|
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
|
||||||
let lagging_wal_timeout = tenant_conf_guard
|
let lagging_wal_timeout = tenant_conf
|
||||||
.tenant_conf
|
.tenant_conf
|
||||||
.lagging_wal_timeout
|
.lagging_wal_timeout
|
||||||
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
|
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
|
||||||
let max_lsn_wal_lag = tenant_conf_guard
|
let max_lsn_wal_lag = tenant_conf
|
||||||
.tenant_conf
|
.tenant_conf
|
||||||
.max_lsn_wal_lag
|
.max_lsn_wal_lag
|
||||||
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
|
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
|
||||||
drop(tenant_conf_guard);
|
|
||||||
|
|
||||||
let mut guard = self.walreceiver.lock().unwrap();
|
let mut guard = self.walreceiver.lock().unwrap();
|
||||||
assert!(
|
assert!(
|
||||||
@@ -2434,10 +2555,6 @@ impl Timeline {
|
|||||||
debug!("cancelling logical size calculation for timeline shutdown");
|
debug!("cancelling logical size calculation for timeline shutdown");
|
||||||
calculation.await
|
calculation.await
|
||||||
}
|
}
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
|
||||||
debug!("cancelling logical size calculation for task shutdown");
|
|
||||||
calculation.await
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2892,16 +3009,6 @@ impl Timeline {
|
|||||||
|
|
||||||
let mut completed_keyspace = KeySpace::default();
|
let mut completed_keyspace = KeySpace::default();
|
||||||
|
|
||||||
// Hold the layer map whilst visiting the timeline to prevent
|
|
||||||
// compaction, eviction and flushes from rendering the layers unreadable.
|
|
||||||
//
|
|
||||||
// TODO: Do we actually need to do this? In theory holding on
|
|
||||||
// to [`tenant::storage_layer::Layer`] should be enough. However,
|
|
||||||
// [`Timeline::get`] also holds the lock during IO, so more investigation
|
|
||||||
// is needed.
|
|
||||||
let guard = timeline.layers.read().await;
|
|
||||||
let layers = guard.layer_map();
|
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if cancel.is_cancelled() {
|
if cancel.is_cancelled() {
|
||||||
return Err(GetVectoredError::Cancelled);
|
return Err(GetVectoredError::Cancelled);
|
||||||
@@ -2911,6 +3018,9 @@ impl Timeline {
|
|||||||
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
|
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
|
||||||
completed_keyspace.merge(&keys_done_last_step);
|
completed_keyspace.merge(&keys_done_last_step);
|
||||||
|
|
||||||
|
let guard = timeline.layers.read().await;
|
||||||
|
let layers = guard.layer_map();
|
||||||
|
|
||||||
let in_memory_layer = layers.find_in_memory_layer(|l| {
|
let in_memory_layer = layers.find_in_memory_layer(|l| {
|
||||||
let start_lsn = l.get_lsn_range().start;
|
let start_lsn = l.get_lsn_range().start;
|
||||||
cont_lsn > start_lsn
|
cont_lsn > start_lsn
|
||||||
@@ -2918,12 +3028,11 @@ impl Timeline {
|
|||||||
|
|
||||||
match in_memory_layer {
|
match in_memory_layer {
|
||||||
Some(l) => {
|
Some(l) => {
|
||||||
|
let lsn_range = l.get_lsn_range().start..cont_lsn;
|
||||||
fringe.update(
|
fringe.update(
|
||||||
ReadableLayerDesc::InMemory {
|
ReadableLayer::InMemoryLayer(l),
|
||||||
handle: l,
|
|
||||||
lsn_ceil: cont_lsn,
|
|
||||||
},
|
|
||||||
unmapped_keyspace.clone(),
|
unmapped_keyspace.clone(),
|
||||||
|
lsn_range,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
@@ -2935,30 +3044,43 @@ impl Timeline {
|
|||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||||
(
|
(
|
||||||
ReadableLayerDesc::Persistent {
|
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
|
||||||
desc: (*layer).clone(),
|
|
||||||
lsn_range: lsn_floor..cont_lsn,
|
|
||||||
},
|
|
||||||
keyspace_accum.to_keyspace(),
|
keyspace_accum.to_keyspace(),
|
||||||
|
lsn_floor..cont_lsn,
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
.for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
|
.for_each(|(layer, keyspace, lsn_range)| {
|
||||||
|
fringe.update(layer, keyspace, lsn_range)
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
|
// It's safe to drop the layer map lock after planning the next round of reads.
|
||||||
|
// The fringe keeps readable handles for the layers which are safe to read even
|
||||||
|
// if layers were compacted or flushed.
|
||||||
|
//
|
||||||
|
// The more interesting consideration is: "Why is the read algorithm still correct
|
||||||
|
// if the layer map changes while it is operating?". Doing a vectored read on a
|
||||||
|
// timeline boils down to pushing an imaginary lsn boundary downwards for each range
|
||||||
|
// covered by the read. The layer map tells us how to move the lsn downwards for a
|
||||||
|
// range at *a particular point in time*. It is fine for the answer to be different
|
||||||
|
// at two different time points.
|
||||||
|
drop(guard);
|
||||||
|
|
||||||
|
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||||
|
let next_cont_lsn = lsn_range.start;
|
||||||
layer_to_read
|
layer_to_read
|
||||||
.get_values_reconstruct_data(
|
.get_values_reconstruct_data(
|
||||||
&guard,
|
|
||||||
keyspace_to_read.clone(),
|
keyspace_to_read.clone(),
|
||||||
|
lsn_range,
|
||||||
reconstruct_state,
|
reconstruct_state,
|
||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
unmapped_keyspace = keyspace_to_read;
|
unmapped_keyspace = keyspace_to_read;
|
||||||
cont_lsn = layer_to_read.get_lsn_floor();
|
cont_lsn = next_cont_lsn;
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -3036,7 +3158,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
ancestor
|
ancestor
|
||||||
.wait_lsn(self.ancestor_lsn, ctx)
|
.wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| match e {
|
.map_err(|e| match e {
|
||||||
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
|
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
|
||||||
@@ -3086,7 +3208,9 @@ impl Timeline {
|
|||||||
self.last_record_lsn.advance(new_lsn);
|
self.last_record_lsn.advance(new_lsn);
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn freeze_inmem_layer(&self, write_lock_held: bool) {
|
/// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
|
||||||
|
/// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
|
||||||
|
async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
|
||||||
// Freeze the current open in-memory layer. It will be written to disk on next
|
// Freeze the current open in-memory layer. It will be written to disk on next
|
||||||
// iteration.
|
// iteration.
|
||||||
|
|
||||||
@@ -3096,7 +3220,9 @@ impl Timeline {
|
|||||||
Some(self.write_lock.lock().await)
|
Some(self.write_lock.lock().await)
|
||||||
};
|
};
|
||||||
|
|
||||||
self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
|
let to_lsn = self.get_last_record_lsn();
|
||||||
|
self.freeze_inmem_layer_at(to_lsn).await;
|
||||||
|
to_lsn
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn freeze_inmem_layer_at(&self, at: Lsn) {
|
async fn freeze_inmem_layer_at(&self, at: Lsn) {
|
||||||
@@ -3109,25 +3235,24 @@ impl Timeline {
|
|||||||
/// Layer flusher task's main loop.
|
/// Layer flusher task's main loop.
|
||||||
async fn flush_loop(
|
async fn flush_loop(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
|
mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) {
|
) {
|
||||||
info!("started flush loop");
|
info!("started flush loop");
|
||||||
loop {
|
loop {
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = self.cancel.cancelled() => {
|
_ = self.cancel.cancelled() => {
|
||||||
info!("shutting down layer flush task");
|
info!("shutting down layer flush task due to Timeline::cancel");
|
||||||
break;
|
|
||||||
},
|
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
|
||||||
info!("shutting down layer flush task");
|
|
||||||
break;
|
break;
|
||||||
},
|
},
|
||||||
_ = layer_flush_start_rx.changed() => {}
|
_ = layer_flush_start_rx.changed() => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
trace!("waking up");
|
trace!("waking up");
|
||||||
let flush_counter = *layer_flush_start_rx.borrow();
|
let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow();
|
||||||
|
|
||||||
|
// The highest LSN to which we flushed in the loop over frozen layers
|
||||||
|
let mut flushed_to_lsn = Lsn(0);
|
||||||
|
|
||||||
let result = loop {
|
let result = loop {
|
||||||
if self.cancel.is_cancelled() {
|
if self.cancel.is_cancelled() {
|
||||||
info!("dropping out of flush loop for timeline shutdown");
|
info!("dropping out of flush loop for timeline shutdown");
|
||||||
@@ -3148,7 +3273,9 @@ impl Timeline {
|
|||||||
break Ok(());
|
break Ok(());
|
||||||
};
|
};
|
||||||
match self.flush_frozen_layer(layer_to_flush, ctx).await {
|
match self.flush_frozen_layer(layer_to_flush, ctx).await {
|
||||||
Ok(()) => {}
|
Ok(this_layer_to_lsn) => {
|
||||||
|
flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
|
||||||
|
}
|
||||||
Err(FlushLayerError::Cancelled) => {
|
Err(FlushLayerError::Cancelled) => {
|
||||||
info!("dropping out of flush loop for timeline shutdown");
|
info!("dropping out of flush loop for timeline shutdown");
|
||||||
return;
|
return;
|
||||||
@@ -3157,11 +3284,36 @@ impl Timeline {
|
|||||||
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
|
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
|
||||||
) => {
|
) => {
|
||||||
error!("could not flush frozen layer: {err:?}");
|
error!("could not flush frozen layer: {err:?}");
|
||||||
break err;
|
break err.map(|_| ());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
timer.stop_and_record();
|
timer.stop_and_record();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Unsharded tenants should never advance their LSN beyond the end of the
|
||||||
|
// highest layer they write: such gaps between layer data and the frozen LSN
|
||||||
|
// are only legal on sharded tenants.
|
||||||
|
debug_assert!(
|
||||||
|
self.shard_identity.count.count() > 1
|
||||||
|
|| flushed_to_lsn >= frozen_to_lsn
|
||||||
|
|| !flushed_to_lsn.is_valid()
|
||||||
|
);
|
||||||
|
|
||||||
|
if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
|
||||||
|
// If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
|
||||||
|
// to us via layer_flush_start_rx, then advance it here.
|
||||||
|
//
|
||||||
|
// This path is only taken for tenants with multiple shards: single sharded tenants should
|
||||||
|
// never encounter a gap in the wal.
|
||||||
|
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||||
|
tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}");
|
||||||
|
if self.set_disk_consistent_lsn(frozen_to_lsn) {
|
||||||
|
if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) {
|
||||||
|
tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Notify any listeners that we're done
|
// Notify any listeners that we're done
|
||||||
let _ = self
|
let _ = self
|
||||||
.layer_flush_done_tx
|
.layer_flush_done_tx
|
||||||
@@ -3169,7 +3321,13 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
|
/// Request the flush loop to write out all frozen layers up to `last_record_lsn` as Delta L0 files to disk.
|
||||||
|
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
|
||||||
|
///
|
||||||
|
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
|
||||||
|
/// it means no data will be written between the top of the highest frozen layer and `last_record_lsn`,
|
||||||
|
/// e.g. because this tenant shard has ingested up to `last_record_lsn` and not written any data locally for that part of the WAL.
|
||||||
|
async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
|
||||||
let mut rx = self.layer_flush_done_tx.subscribe();
|
let mut rx = self.layer_flush_done_tx.subscribe();
|
||||||
|
|
||||||
// Increment the flush cycle counter and wake up the flush task.
|
// Increment the flush cycle counter and wake up the flush task.
|
||||||
@@ -3183,9 +3341,10 @@ impl Timeline {
|
|||||||
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
|
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
|
||||||
}
|
}
|
||||||
|
|
||||||
self.layer_flush_start_tx.send_modify(|counter| {
|
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
|
||||||
my_flush_request = *counter + 1;
|
my_flush_request = *counter + 1;
|
||||||
*counter = my_flush_request;
|
*counter = my_flush_request;
|
||||||
|
*lsn = std::cmp::max(last_record_lsn, *lsn);
|
||||||
});
|
});
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
@@ -3222,16 +3381,22 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn flush_frozen_layers(&self) {
|
fn flush_frozen_layers(&self) {
|
||||||
self.layer_flush_start_tx.send_modify(|val| *val += 1);
|
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
|
||||||
|
*counter += 1;
|
||||||
|
|
||||||
|
*lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||||
|
///
|
||||||
|
/// Return value is the last lsn (inclusive) of the layer that was frozen.
|
||||||
#[instrument(skip_all, fields(layer=%frozen_layer))]
|
#[instrument(skip_all, fields(layer=%frozen_layer))]
|
||||||
async fn flush_frozen_layer(
|
async fn flush_frozen_layer(
|
||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
frozen_layer: Arc<InMemoryLayer>,
|
frozen_layer: Arc<InMemoryLayer>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<(), FlushLayerError> {
|
) -> Result<Lsn, FlushLayerError> {
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
// As a special case, when we have just imported an image into the repository,
|
// As a special case, when we have just imported an image into the repository,
|
||||||
@@ -3306,7 +3471,6 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
|
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
|
||||||
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
|
||||||
|
|
||||||
// The new on-disk layers are now in the layer map. We can remove the
|
// The new on-disk layers are now in the layer map. We can remove the
|
||||||
// in-memory layer from the map now. The flushed layer is stored in
|
// in-memory layer from the map now. The flushed layer is stored in
|
||||||
@@ -3320,10 +3484,7 @@ impl Timeline {
|
|||||||
|
|
||||||
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
|
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
|
||||||
|
|
||||||
if disk_consistent_lsn != old_disk_consistent_lsn {
|
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
|
||||||
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
|
||||||
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
|
||||||
|
|
||||||
// Schedule remote uploads that will reflect our new disk_consistent_lsn
|
// Schedule remote uploads that will reflect our new disk_consistent_lsn
|
||||||
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
|
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
|
||||||
}
|
}
|
||||||
@@ -3340,7 +3501,22 @@ impl Timeline {
|
|||||||
// This failpoint is used by another test case `test_pageserver_recovery`.
|
// This failpoint is used by another test case `test_pageserver_recovery`.
|
||||||
fail_point!("flush-frozen-exit");
|
fail_point!("flush-frozen-exit");
|
||||||
|
|
||||||
Ok(())
|
Ok(Lsn(lsn_range.end.0 - 1))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return true if the value changed
|
||||||
|
///
|
||||||
|
/// This function must only be used from the layer flush task, and may not be called concurrently.
|
||||||
|
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
|
||||||
|
// We do a simple load/store cycle: that's why this function isn't safe for concurrent use.
|
||||||
|
let old_value = self.disk_consistent_lsn.load();
|
||||||
|
if new_value != old_value {
|
||||||
|
assert!(new_value >= old_value);
|
||||||
|
self.disk_consistent_lsn.store(new_value);
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update metadata file
|
/// Update metadata file
|
||||||
@@ -3501,6 +3677,24 @@ impl Timeline {
|
|||||||
|
|
||||||
// Is it time to create a new image layer for the given partition?
|
// Is it time to create a new image layer for the given partition?
|
||||||
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
||||||
|
let last = self.last_image_layer_creation_check_at.load();
|
||||||
|
if lsn != Lsn(0) {
|
||||||
|
let distance = lsn
|
||||||
|
.checked_sub(last)
|
||||||
|
.expect("Attempt to compact with LSN going backwards");
|
||||||
|
|
||||||
|
let min_distance = self.get_image_layer_creation_check_threshold() as u64
|
||||||
|
* self.get_checkpoint_distance();
|
||||||
|
|
||||||
|
// Skip the expensive delta layer counting below if we've not ingested
|
||||||
|
// sufficient WAL since the last check.
|
||||||
|
if distance.0 < min_distance {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
self.last_image_layer_creation_check_at.store(lsn);
|
||||||
|
|
||||||
let threshold = self.get_image_creation_threshold();
|
let threshold = self.get_image_creation_threshold();
|
||||||
|
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
@@ -3842,6 +4036,24 @@ impl Timeline {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Schedules the uploads of the given image layers
|
||||||
|
fn upload_new_image_layers(
|
||||||
|
self: &Arc<Self>,
|
||||||
|
new_images: impl IntoIterator<Item = ResidentLayer>,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let Some(remote_client) = &self.remote_client else {
|
||||||
|
return Ok(());
|
||||||
|
};
|
||||||
|
for layer in new_images {
|
||||||
|
remote_client.schedule_layer_file_upload(layer)?;
|
||||||
|
}
|
||||||
|
// should any new image layer have been created, not uploading index_part will
|
||||||
|
// result in a mismatch between remote_physical_size and layermap calculated
|
||||||
|
// size, which will fail some tests, but should not be an issue otherwise.
|
||||||
|
remote_client.schedule_index_upload_for_file_changes()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Update information about which layer files need to be retained on
|
/// Update information about which layer files need to be retained on
|
||||||
/// garbage collection. This is separate from actually performing the GC,
|
/// garbage collection. This is separate from actually performing the GC,
|
||||||
/// and is updated more frequently, so that compaction can remove obsolete
|
/// and is updated more frequently, so that compaction can remove obsolete
|
||||||
|
|||||||
@@ -125,18 +125,8 @@ impl Timeline {
|
|||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.map_err(anyhow::Error::from)?;
|
.map_err(anyhow::Error::from)?;
|
||||||
if let Some(remote_client) = &self.remote_client {
|
|
||||||
for layer in layers {
|
|
||||||
remote_client.schedule_layer_file_upload(layer)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(remote_client) = &self.remote_client {
|
self.upload_new_image_layers(layers)?;
|
||||||
// should any new image layer have been created, not uploading index_part will
|
|
||||||
// result in a mismatch between remote_physical_size and layermap calculated
|
|
||||||
// size, which will fail some tests, but should not be an issue otherwise.
|
|
||||||
remote_client.schedule_index_upload_for_file_changes()?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
// no partitioning? This is normal, if the timeline was just created
|
// no partitioning? This is normal, if the timeline was just created
|
||||||
@@ -818,7 +808,10 @@ impl TimelineAdaptor {
|
|||||||
self.timeline
|
self.timeline
|
||||||
.finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
|
.finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
|
||||||
.await?;
|
.await?;
|
||||||
self.new_images.clear();
|
|
||||||
|
self.timeline
|
||||||
|
.upload_new_image_layers(std::mem::take(&mut self.new_images))?;
|
||||||
|
|
||||||
self.new_deltas.clear();
|
self.new_deltas.clear();
|
||||||
self.layers_to_delete.clear();
|
self.layers_to_delete.clear();
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use std::{
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
||||||
use tokio::sync::OwnedMutexGuard;
|
use tokio::sync::OwnedMutexGuard;
|
||||||
use tracing::{debug, error, info, instrument, Instrument};
|
use tracing::{error, info, instrument, Instrument};
|
||||||
use utils::{crashsafe, fs_ext, id::TimelineId};
|
use utils::{crashsafe, fs_ext, id::TimelineId};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
@@ -14,7 +14,6 @@ use crate::{
|
|||||||
deletion_queue::DeletionQueueClient,
|
deletion_queue::DeletionQueueClient,
|
||||||
task_mgr::{self, TaskKind},
|
task_mgr::{self, TaskKind},
|
||||||
tenant::{
|
tenant::{
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
|
||||||
metadata::TimelineMetadata,
|
metadata::TimelineMetadata,
|
||||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||||
CreateTimelineCause, DeleteTimelineError, Tenant,
|
CreateTimelineCause, DeleteTimelineError, Tenant,
|
||||||
@@ -23,58 +22,6 @@ use crate::{
|
|||||||
|
|
||||||
use super::{Timeline, TimelineResources};
|
use super::{Timeline, TimelineResources};
|
||||||
|
|
||||||
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
|
||||||
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
|
||||||
// Notify any timeline work to drop out of loops/requests
|
|
||||||
tracing::debug!("Cancelling CancellationToken");
|
|
||||||
timeline.cancel.cancel();
|
|
||||||
|
|
||||||
// Stop the walreceiver first.
|
|
||||||
debug!("waiting for wal receiver to shutdown");
|
|
||||||
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
|
|
||||||
if let Some(walreceiver) = maybe_started_walreceiver {
|
|
||||||
walreceiver.stop().await;
|
|
||||||
}
|
|
||||||
debug!("wal receiver shutdown confirmed");
|
|
||||||
|
|
||||||
// Shut down the layer flush task before the remote client, as one depends on the other
|
|
||||||
task_mgr::shutdown_tasks(
|
|
||||||
Some(TaskKind::LayerFlushTask),
|
|
||||||
Some(timeline.tenant_shard_id),
|
|
||||||
Some(timeline.timeline_id),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
// Prevent new uploads from starting.
|
|
||||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
|
||||||
remote_client.stop();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stop & wait for the remaining timeline tasks, including upload tasks.
|
|
||||||
// NB: This and other delete_timeline calls do not run as a task_mgr task,
|
|
||||||
// so, they are not affected by this shutdown_tasks() call.
|
|
||||||
info!("waiting for timeline tasks to shutdown");
|
|
||||||
task_mgr::shutdown_tasks(
|
|
||||||
None,
|
|
||||||
Some(timeline.tenant_shard_id),
|
|
||||||
Some(timeline.timeline_id),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
|
||||||
Err(anyhow::anyhow!(
|
|
||||||
"failpoint: timeline-delete-before-index-deleted-at"
|
|
||||||
))?
|
|
||||||
});
|
|
||||||
|
|
||||||
tracing::debug!("Waiting for gate...");
|
|
||||||
timeline.gate.close().await;
|
|
||||||
tracing::debug!("Shutdown complete");
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Mark timeline as deleted in S3 so we won't pick it up next time
|
/// Mark timeline as deleted in S3 so we won't pick it up next time
|
||||||
/// during attach or pageserver restart.
|
/// during attach or pageserver restart.
|
||||||
/// See comment in persist_index_part_with_deleted_flag.
|
/// See comment in persist_index_part_with_deleted_flag.
|
||||||
@@ -268,7 +215,14 @@ impl DeleteTimelineFlow {
|
|||||||
|
|
||||||
guard.mark_in_progress()?;
|
guard.mark_in_progress()?;
|
||||||
|
|
||||||
stop_tasks(&timeline).await?;
|
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||||
|
timeline.shutdown(super::ShutdownMode::Hard).await;
|
||||||
|
|
||||||
|
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||||
|
Err(anyhow::anyhow!(
|
||||||
|
"failpoint: timeline-delete-before-index-deleted-at"
|
||||||
|
))?
|
||||||
|
});
|
||||||
|
|
||||||
set_deleted_in_remote_index(&timeline).await?;
|
set_deleted_in_remote_index(&timeline).await?;
|
||||||
|
|
||||||
|
|||||||
@@ -67,20 +67,19 @@ impl Timeline {
|
|||||||
),
|
),
|
||||||
false,
|
false,
|
||||||
async move {
|
async move {
|
||||||
let cancel = task_mgr::shutdown_token();
|
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = cancel.cancelled() => { return Ok(()); }
|
_ = self_clone.cancel.cancelled() => { return Ok(()); }
|
||||||
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
|
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
|
||||||
};
|
};
|
||||||
|
|
||||||
self_clone.eviction_task(parent, cancel).await;
|
self_clone.eviction_task(parent).await;
|
||||||
Ok(())
|
Ok(())
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
||||||
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>, cancel: CancellationToken) {
|
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
|
||||||
use crate::tenant::tasks::random_init_delay;
|
use crate::tenant::tasks::random_init_delay;
|
||||||
|
|
||||||
// acquire the gate guard only once within a useful span
|
// acquire the gate guard only once within a useful span
|
||||||
@@ -95,7 +94,7 @@ impl Timeline {
|
|||||||
EvictionPolicy::OnlyImitiate(lat) => lat.period,
|
EvictionPolicy::OnlyImitiate(lat) => lat.period,
|
||||||
EvictionPolicy::NoEviction => Duration::from_secs(10),
|
EvictionPolicy::NoEviction => Duration::from_secs(10),
|
||||||
};
|
};
|
||||||
if random_init_delay(period, &cancel).await.is_err() {
|
if random_init_delay(period, &self.cancel).await.is_err() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -104,13 +103,13 @@ impl Timeline {
|
|||||||
loop {
|
loop {
|
||||||
let policy = self.get_eviction_policy();
|
let policy = self.get_eviction_policy();
|
||||||
let cf = self
|
let cf = self
|
||||||
.eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx)
|
.eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
match cf {
|
match cf {
|
||||||
ControlFlow::Break(()) => break,
|
ControlFlow::Break(()) => break,
|
||||||
ControlFlow::Continue(sleep_until) => {
|
ControlFlow::Continue(sleep_until) => {
|
||||||
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
|
if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
|
||||||
.await
|
.await
|
||||||
.is_ok()
|
.is_ok()
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -120,9 +120,10 @@ impl LayerManager {
|
|||||||
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
|
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
|
||||||
pub(crate) async fn try_freeze_in_memory_layer(
|
pub(crate) async fn try_freeze_in_memory_layer(
|
||||||
&mut self,
|
&mut self,
|
||||||
Lsn(last_record_lsn): Lsn,
|
lsn: Lsn,
|
||||||
last_freeze_at: &AtomicLsn,
|
last_freeze_at: &AtomicLsn,
|
||||||
) {
|
) {
|
||||||
|
let Lsn(last_record_lsn) = lsn;
|
||||||
let end_lsn = Lsn(last_record_lsn + 1);
|
let end_lsn = Lsn(last_record_lsn + 1);
|
||||||
|
|
||||||
if let Some(open_layer) = &self.layer_map.open_layer {
|
if let Some(open_layer) = &self.layer_map.open_layer {
|
||||||
@@ -135,8 +136,11 @@ impl LayerManager {
|
|||||||
self.layer_map.frozen_layers.push_back(open_layer_rc);
|
self.layer_map.frozen_layers.push_back(open_layer_rc);
|
||||||
self.layer_map.open_layer = None;
|
self.layer_map.open_layer = None;
|
||||||
self.layer_map.next_open_layer_at = Some(end_lsn);
|
self.layer_map.next_open_layer_at = Some(end_lsn);
|
||||||
last_freeze_at.store(end_lsn);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
|
||||||
|
// accounts for regions in the LSN range where we might have ingested no data due to sharding.
|
||||||
|
last_freeze_at.store(end_lsn);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||||
|
|||||||
@@ -24,26 +24,21 @@ mod connection_manager;
|
|||||||
mod walreceiver_connection;
|
mod walreceiver_connection;
|
||||||
|
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
|
use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
|
||||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||||
use crate::tenant::timeline::walreceiver::connection_manager::{
|
use crate::tenant::timeline::walreceiver::connection_manager::{
|
||||||
connection_manager_loop_step, ConnectionManagerState,
|
connection_manager_loop_step, ConnectionManagerState,
|
||||||
};
|
};
|
||||||
|
|
||||||
use pageserver_api::shard::TenantShardId;
|
|
||||||
use std::future::Future;
|
use std::future::Future;
|
||||||
use std::num::NonZeroU64;
|
use std::num::NonZeroU64;
|
||||||
use std::ops::ControlFlow;
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use storage_broker::BrokerClientChannel;
|
use storage_broker::BrokerClientChannel;
|
||||||
use tokio::select;
|
|
||||||
use tokio::sync::watch;
|
use tokio::sync::watch;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
use utils::id::TimelineId;
|
|
||||||
|
|
||||||
use self::connection_manager::ConnectionManagerStatus;
|
use self::connection_manager::ConnectionManagerStatus;
|
||||||
|
|
||||||
use super::Timeline;
|
use super::Timeline;
|
||||||
@@ -62,9 +57,10 @@ pub struct WalReceiverConf {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub struct WalReceiver {
|
pub struct WalReceiver {
|
||||||
tenant_shard_id: TenantShardId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
|
manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
|
||||||
|
/// All tasks spawned by [`WalReceiver::start`] and its children are sensitive to this token.
|
||||||
|
/// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
|
||||||
|
cancel: CancellationToken,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl WalReceiver {
|
impl WalReceiver {
|
||||||
@@ -78,65 +74,58 @@ impl WalReceiver {
|
|||||||
let timeline_id = timeline.timeline_id;
|
let timeline_id = timeline.timeline_id;
|
||||||
let walreceiver_ctx =
|
let walreceiver_ctx =
|
||||||
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
|
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
|
||||||
|
|
||||||
let loop_status = Arc::new(std::sync::RwLock::new(None));
|
let loop_status = Arc::new(std::sync::RwLock::new(None));
|
||||||
let manager_status = Arc::clone(&loop_status);
|
let manager_status = Arc::clone(&loop_status);
|
||||||
task_mgr::spawn(
|
let cancel = timeline.cancel.child_token();
|
||||||
WALRECEIVER_RUNTIME.handle(),
|
WALRECEIVER_RUNTIME.spawn({
|
||||||
TaskKind::WalReceiverManager,
|
let cancel = cancel.clone();
|
||||||
Some(timeline.tenant_shard_id),
|
|
||||||
Some(timeline_id),
|
|
||||||
&format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
|
|
||||||
false,
|
|
||||||
async move {
|
async move {
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
// acquire timeline gate so we know the task doesn't outlive the Timeline
|
||||||
|
let Ok(_guard) = timeline.gate.enter() else {
|
||||||
|
debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already");
|
||||||
|
return;
|
||||||
|
};
|
||||||
debug!("WAL receiver manager started, connecting to broker");
|
debug!("WAL receiver manager started, connecting to broker");
|
||||||
let mut connection_manager_state = ConnectionManagerState::new(
|
let mut connection_manager_state = ConnectionManagerState::new(
|
||||||
timeline,
|
timeline,
|
||||||
conf,
|
conf,
|
||||||
|
cancel.clone(),
|
||||||
);
|
);
|
||||||
loop {
|
while !cancel.is_cancelled() {
|
||||||
select! {
|
let loop_step_result = connection_manager_loop_step(
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
&mut broker_client,
|
||||||
trace!("WAL receiver shutdown requested, shutting down");
|
&mut connection_manager_state,
|
||||||
|
&walreceiver_ctx,
|
||||||
|
&cancel,
|
||||||
|
&loop_status,
|
||||||
|
).await;
|
||||||
|
match loop_step_result {
|
||||||
|
Ok(()) => continue,
|
||||||
|
Err(_cancelled) => {
|
||||||
|
trace!("Connection manager loop ended, shutting down");
|
||||||
break;
|
break;
|
||||||
},
|
}
|
||||||
loop_step_result = connection_manager_loop_step(
|
|
||||||
&mut broker_client,
|
|
||||||
&mut connection_manager_state,
|
|
||||||
&walreceiver_ctx,
|
|
||||||
&loop_status,
|
|
||||||
) => match loop_step_result {
|
|
||||||
ControlFlow::Continue(()) => continue,
|
|
||||||
ControlFlow::Break(()) => {
|
|
||||||
trace!("Connection manager loop ended, shutting down");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
connection_manager_state.shutdown().await;
|
connection_manager_state.shutdown().await;
|
||||||
*loop_status.write().unwrap() = None;
|
*loop_status.write().unwrap() = None;
|
||||||
Ok(())
|
debug!("task exits");
|
||||||
}
|
}
|
||||||
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
|
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
|
||||||
);
|
});
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
tenant_shard_id,
|
|
||||||
timeline_id,
|
|
||||||
manager_status,
|
manager_status,
|
||||||
|
cancel,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn stop(self) {
|
#[instrument(skip_all, level = tracing::Level::DEBUG)]
|
||||||
task_mgr::shutdown_tasks(
|
pub fn cancel(&self) {
|
||||||
Some(TaskKind::WalReceiverManager),
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
Some(self.tenant_shard_id),
|
debug!("cancelling walreceiver tasks");
|
||||||
Some(self.timeline_id),
|
self.cancel.cancel();
|
||||||
)
|
|
||||||
.await;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
|
pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
|
||||||
@@ -170,14 +159,18 @@ enum TaskStateUpdate<E> {
|
|||||||
|
|
||||||
impl<E: Clone> TaskHandle<E> {
|
impl<E: Clone> TaskHandle<E> {
|
||||||
/// Initializes the task, starting it immediately after the creation.
|
/// Initializes the task, starting it immediately after the creation.
|
||||||
|
///
|
||||||
|
/// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]).
|
||||||
|
/// It being a child token enables us to provide a [`Self::shutdown`] method.
|
||||||
fn spawn<Fut>(
|
fn spawn<Fut>(
|
||||||
|
cancel_parent: &CancellationToken,
|
||||||
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
|
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
|
||||||
) -> Self
|
) -> Self
|
||||||
where
|
where
|
||||||
Fut: Future<Output = anyhow::Result<()>> + Send,
|
Fut: Future<Output = anyhow::Result<()>> + Send,
|
||||||
E: Send + Sync + 'static,
|
E: Send + Sync + 'static,
|
||||||
{
|
{
|
||||||
let cancellation = CancellationToken::new();
|
let cancellation = cancel_parent.child_token();
|
||||||
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
|
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
|
||||||
|
|
||||||
let cancellation_clone = cancellation.clone();
|
let cancellation_clone = cancellation.clone();
|
||||||
@@ -197,6 +190,9 @@ impl<E: Clone> TaskHandle<E> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// # Cancel-Safety
|
||||||
|
///
|
||||||
|
/// Cancellation-safe.
|
||||||
async fn next_task_event(&mut self) -> TaskEvent<E> {
|
async fn next_task_event(&mut self) -> TaskEvent<E> {
|
||||||
match self.events_receiver.changed().await {
|
match self.events_receiver.changed().await {
|
||||||
Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
|
Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ use crate::metrics::{
|
|||||||
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
|
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
|
||||||
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
|
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
|
||||||
};
|
};
|
||||||
use crate::task_mgr::{shutdown_token, TaskKind};
|
use crate::task_mgr::TaskKind;
|
||||||
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
|
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use chrono::{NaiveDateTime, Utc};
|
use chrono::{NaiveDateTime, Utc};
|
||||||
@@ -27,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
|
|||||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||||
use storage_broker::{BrokerClientChannel, Code, Streaming};
|
use storage_broker::{BrokerClientChannel, Code, Streaming};
|
||||||
use tokio::select;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
use postgres_connection::PgConnectionConfig;
|
use postgres_connection::PgConnectionConfig;
|
||||||
@@ -45,27 +45,33 @@ use super::{
|
|||||||
TaskEvent, TaskHandle,
|
TaskEvent, TaskHandle,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
pub(crate) struct Cancelled;
|
||||||
|
|
||||||
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
|
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
|
||||||
/// Based on the updates, decides whether to start, keep or stop a WAL receiver task.
|
/// Based on the updates, decides whether to start, keep or stop a WAL receiver task.
|
||||||
/// If storage broker subscription is cancelled, exits.
|
/// If storage broker subscription is cancelled, exits.
|
||||||
|
///
|
||||||
|
/// # Cancel-Safety
|
||||||
|
///
|
||||||
|
/// Not cancellation-safe. Use `cancel` token to request cancellation.
|
||||||
pub(super) async fn connection_manager_loop_step(
|
pub(super) async fn connection_manager_loop_step(
|
||||||
broker_client: &mut BrokerClientChannel,
|
broker_client: &mut BrokerClientChannel,
|
||||||
connection_manager_state: &mut ConnectionManagerState,
|
connection_manager_state: &mut ConnectionManagerState,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
|
cancel: &CancellationToken,
|
||||||
manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
|
manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
|
||||||
) -> ControlFlow<(), ()> {
|
) -> Result<(), Cancelled> {
|
||||||
match connection_manager_state
|
match tokio::select! {
|
||||||
.timeline
|
_ = cancel.cancelled() => { return Err(Cancelled); },
|
||||||
.wait_to_become_active(ctx)
|
st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st }
|
||||||
.await
|
} {
|
||||||
{
|
|
||||||
Ok(()) => {}
|
Ok(()) => {}
|
||||||
Err(new_state) => {
|
Err(new_state) => {
|
||||||
debug!(
|
debug!(
|
||||||
?new_state,
|
?new_state,
|
||||||
"state changed, stopping wal connection manager loop"
|
"state changed, stopping wal connection manager loop"
|
||||||
);
|
);
|
||||||
return ControlFlow::Break(());
|
return Err(Cancelled);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -86,7 +92,7 @@ pub(super) async fn connection_manager_loop_step(
|
|||||||
// Subscribe to the broker updates. Stream shares underlying TCP connection
|
// Subscribe to the broker updates. Stream shares underlying TCP connection
|
||||||
// with other streams on this client (other connection managers). When
|
// with other streams on this client (other connection managers). When
|
||||||
// object goes out of scope, stream finishes in drop() automatically.
|
// object goes out of scope, stream finishes in drop() automatically.
|
||||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
|
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||||
debug!("Subscribed for broker timeline updates");
|
debug!("Subscribed for broker timeline updates");
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
@@ -94,6 +100,7 @@ pub(super) async fn connection_manager_loop_step(
|
|||||||
|
|
||||||
// These things are happening concurrently:
|
// These things are happening concurrently:
|
||||||
//
|
//
|
||||||
|
// - cancellation request
|
||||||
// - keep receiving WAL on the current connection
|
// - keep receiving WAL on the current connection
|
||||||
// - if the shared state says we need to change connection, disconnect and return
|
// - if the shared state says we need to change connection, disconnect and return
|
||||||
// - this runs in a separate task and we receive updates via a watch channel
|
// - this runs in a separate task and we receive updates via a watch channel
|
||||||
@@ -101,7 +108,11 @@ pub(super) async fn connection_manager_loop_step(
|
|||||||
// - receive updates from broker
|
// - receive updates from broker
|
||||||
// - this might change the current desired connection
|
// - this might change the current desired connection
|
||||||
// - timeline state changes to something that does not allow walreceiver to run concurrently
|
// - timeline state changes to something that does not allow walreceiver to run concurrently
|
||||||
select! {
|
|
||||||
|
// NB: make sure each of the select expressions are cancellation-safe
|
||||||
|
// (no need for arms to be cancellation-safe).
|
||||||
|
tokio::select! {
|
||||||
|
_ = cancel.cancelled() => { return Err(Cancelled); }
|
||||||
Some(wal_connection_update) = async {
|
Some(wal_connection_update) = async {
|
||||||
match connection_manager_state.wal_connection.as_mut() {
|
match connection_manager_state.wal_connection.as_mut() {
|
||||||
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
|
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
|
||||||
@@ -133,7 +144,7 @@ pub(super) async fn connection_manager_loop_step(
|
|||||||
},
|
},
|
||||||
|
|
||||||
// Got a new update from the broker
|
// Got a new update from the broker
|
||||||
broker_update = broker_subscription.message() => {
|
broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
|
||||||
match broker_update {
|
match broker_update {
|
||||||
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
|
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
|
||||||
Err(status) => {
|
Err(status) => {
|
||||||
@@ -147,16 +158,17 @@ pub(super) async fn connection_manager_loop_step(
|
|||||||
warn!("broker subscription failed: {status}");
|
warn!("broker subscription failed: {status}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ControlFlow::Continue(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
Ok(None) => {
|
Ok(None) => {
|
||||||
error!("broker subscription stream ended"); // can't happen
|
error!("broker subscription stream ended"); // can't happen
|
||||||
return ControlFlow::Continue(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
new_event = async {
|
new_event = async {
|
||||||
|
// Reminder: this async block is a select expression (not an arm body), so it needs to be cancellation-safe.
|
||||||
loop {
|
loop {
|
||||||
if connection_manager_state.timeline.current_state() == TimelineState::Loading {
|
if connection_manager_state.timeline.current_state() == TimelineState::Loading {
|
||||||
warn!("wal connection manager should only be launched after timeline has become active");
|
warn!("wal connection manager should only be launched after timeline has become active");
|
||||||
@@ -182,11 +194,11 @@ pub(super) async fn connection_manager_loop_step(
|
|||||||
}
|
}
|
||||||
} => match new_event {
|
} => match new_event {
|
||||||
ControlFlow::Continue(()) => {
|
ControlFlow::Continue(()) => {
|
||||||
return ControlFlow::Continue(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
ControlFlow::Break(()) => {
|
ControlFlow::Break(()) => {
|
||||||
debug!("Timeline is no longer active, stopping wal connection manager loop");
|
debug!("Timeline is no longer active, stopping wal connection manager loop");
|
||||||
return ControlFlow::Break(());
|
return Err(Cancelled);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
@@ -218,16 +230,15 @@ pub(super) async fn connection_manager_loop_step(
|
|||||||
async fn subscribe_for_timeline_updates(
|
async fn subscribe_for_timeline_updates(
|
||||||
broker_client: &mut BrokerClientChannel,
|
broker_client: &mut BrokerClientChannel,
|
||||||
id: TenantTimelineId,
|
id: TenantTimelineId,
|
||||||
) -> Streaming<SafekeeperTimelineInfo> {
|
cancel: &CancellationToken,
|
||||||
|
) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
|
||||||
let mut attempt = 0;
|
let mut attempt = 0;
|
||||||
let cancel = shutdown_token();
|
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
exponential_backoff(
|
exponential_backoff(
|
||||||
attempt,
|
attempt,
|
||||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||||
&cancel,
|
cancel,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
attempt += 1;
|
attempt += 1;
|
||||||
@@ -241,9 +252,14 @@ async fn subscribe_for_timeline_updates(
|
|||||||
subscription_key: Some(key),
|
subscription_key: Some(key),
|
||||||
};
|
};
|
||||||
|
|
||||||
match broker_client.subscribe_safekeeper_info(request).await {
|
match {
|
||||||
|
tokio::select! {
|
||||||
|
r = broker_client.subscribe_safekeeper_info(request) => { r }
|
||||||
|
_ = cancel.cancelled() => { return Err(Cancelled); }
|
||||||
|
}
|
||||||
|
} {
|
||||||
Ok(resp) => {
|
Ok(resp) => {
|
||||||
return resp.into_inner();
|
return Ok(resp.into_inner());
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
|
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
|
||||||
@@ -264,6 +280,8 @@ pub(super) struct ConnectionManagerState {
|
|||||||
id: TenantTimelineId,
|
id: TenantTimelineId,
|
||||||
/// Use pageserver data about the timeline to filter out some of the safekeepers.
|
/// Use pageserver data about the timeline to filter out some of the safekeepers.
|
||||||
timeline: Arc<Timeline>,
|
timeline: Arc<Timeline>,
|
||||||
|
/// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn.
|
||||||
|
cancel: CancellationToken,
|
||||||
conf: WalReceiverConf,
|
conf: WalReceiverConf,
|
||||||
/// Current connection to safekeeper for WAL streaming.
|
/// Current connection to safekeeper for WAL streaming.
|
||||||
wal_connection: Option<WalConnection>,
|
wal_connection: Option<WalConnection>,
|
||||||
@@ -386,7 +404,11 @@ struct BrokerSkTimeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl ConnectionManagerState {
|
impl ConnectionManagerState {
|
||||||
pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
|
pub(super) fn new(
|
||||||
|
timeline: Arc<Timeline>,
|
||||||
|
conf: WalReceiverConf,
|
||||||
|
cancel: CancellationToken,
|
||||||
|
) -> Self {
|
||||||
let id = TenantTimelineId {
|
let id = TenantTimelineId {
|
||||||
tenant_id: timeline.tenant_shard_id.tenant_id,
|
tenant_id: timeline.tenant_shard_id.tenant_id,
|
||||||
timeline_id: timeline.timeline_id,
|
timeline_id: timeline.timeline_id,
|
||||||
@@ -394,6 +416,7 @@ impl ConnectionManagerState {
|
|||||||
Self {
|
Self {
|
||||||
id,
|
id,
|
||||||
timeline,
|
timeline,
|
||||||
|
cancel,
|
||||||
conf,
|
conf,
|
||||||
wal_connection: None,
|
wal_connection: None,
|
||||||
wal_stream_candidates: HashMap::new(),
|
wal_stream_candidates: HashMap::new(),
|
||||||
@@ -401,6 +424,22 @@ impl ConnectionManagerState {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn spawn<Fut>(
|
||||||
|
&self,
|
||||||
|
task: impl FnOnce(
|
||||||
|
tokio::sync::watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
|
||||||
|
CancellationToken,
|
||||||
|
) -> Fut
|
||||||
|
+ Send
|
||||||
|
+ 'static,
|
||||||
|
) -> TaskHandle<WalConnectionStatus>
|
||||||
|
where
|
||||||
|
Fut: std::future::Future<Output = anyhow::Result<()>> + Send,
|
||||||
|
{
|
||||||
|
// TODO: get rid of TaskHandle
|
||||||
|
super::TaskHandle::spawn(&self.cancel, task)
|
||||||
|
}
|
||||||
|
|
||||||
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
|
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
|
||||||
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
|
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
|
||||||
WALRECEIVER_SWITCHES
|
WALRECEIVER_SWITCHES
|
||||||
@@ -419,7 +458,7 @@ impl ConnectionManagerState {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let span = info_span!("connection", %node_id);
|
let span = info_span!("connection", %node_id);
|
||||||
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
|
let connection_handle = self.spawn(move |events_sender, cancellation| {
|
||||||
async move {
|
async move {
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
@@ -447,6 +486,12 @@ impl ConnectionManagerState {
|
|||||||
info!("walreceiver connection handling ended: {e}");
|
info!("walreceiver connection handling ended: {e}");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
WalReceiverError::ClosedGate => {
|
||||||
|
info!(
|
||||||
|
"walreceiver connection handling ended because of closed gate"
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
WalReceiverError::Other(e) => {
|
WalReceiverError::Other(e) => {
|
||||||
// give out an error to have task_mgr give it a really verbose logging
|
// give out an error to have task_mgr give it a really verbose logging
|
||||||
if cancellation.is_cancelled() {
|
if cancellation.is_cancelled() {
|
||||||
@@ -486,6 +531,10 @@ impl ConnectionManagerState {
|
|||||||
|
|
||||||
/// Drops the current connection (if any) and updates retry timeout for the next
|
/// Drops the current connection (if any) and updates retry timeout for the next
|
||||||
/// connection attempt to the same safekeeper.
|
/// connection attempt to the same safekeeper.
|
||||||
|
///
|
||||||
|
/// # Cancel-Safety
|
||||||
|
///
|
||||||
|
/// Not cancellation-safe.
|
||||||
async fn drop_old_connection(&mut self, needs_shutdown: bool) {
|
async fn drop_old_connection(&mut self, needs_shutdown: bool) {
|
||||||
let wal_connection = match self.wal_connection.take() {
|
let wal_connection = match self.wal_connection.take() {
|
||||||
Some(wal_connection) => wal_connection,
|
Some(wal_connection) => wal_connection,
|
||||||
@@ -493,7 +542,14 @@ impl ConnectionManagerState {
|
|||||||
};
|
};
|
||||||
|
|
||||||
if needs_shutdown {
|
if needs_shutdown {
|
||||||
wal_connection.connection_task.shutdown().await;
|
wal_connection
|
||||||
|
.connection_task
|
||||||
|
.shutdown()
|
||||||
|
// This here is why this function isn't cancellation-safe.
|
||||||
|
// If we got cancelled here, then self.wal_connection is already None and we lose track of the task.
|
||||||
|
// Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None
|
||||||
|
// and thus be ineffective.
|
||||||
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
let retry = self
|
let retry = self
|
||||||
@@ -838,6 +894,9 @@ impl ConnectionManagerState {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// # Cancel-Safety
|
||||||
|
///
|
||||||
|
/// Not cancellation-safe.
|
||||||
pub(super) async fn shutdown(mut self) {
|
pub(super) async fn shutdown(mut self) {
|
||||||
if let Some(wal_connection) = self.wal_connection.take() {
|
if let Some(wal_connection) = self.wal_connection.take() {
|
||||||
wal_connection.connection_task.shutdown().await;
|
wal_connection.connection_task.shutdown().await;
|
||||||
@@ -986,7 +1045,7 @@ mod tests {
|
|||||||
sk_id: connected_sk_id,
|
sk_id: connected_sk_id,
|
||||||
availability_zone: None,
|
availability_zone: None,
|
||||||
status: connection_status,
|
status: connection_status,
|
||||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
connection_task: state.spawn(move |sender, _| async move {
|
||||||
sender
|
sender
|
||||||
.send(TaskStateUpdate::Progress(connection_status))
|
.send(TaskStateUpdate::Progress(connection_status))
|
||||||
.ok();
|
.ok();
|
||||||
@@ -1154,7 +1213,7 @@ mod tests {
|
|||||||
sk_id: connected_sk_id,
|
sk_id: connected_sk_id,
|
||||||
availability_zone: None,
|
availability_zone: None,
|
||||||
status: connection_status,
|
status: connection_status,
|
||||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
connection_task: state.spawn(move |sender, _| async move {
|
||||||
sender
|
sender
|
||||||
.send(TaskStateUpdate::Progress(connection_status))
|
.send(TaskStateUpdate::Progress(connection_status))
|
||||||
.ok();
|
.ok();
|
||||||
@@ -1221,7 +1280,7 @@ mod tests {
|
|||||||
sk_id: NodeId(1),
|
sk_id: NodeId(1),
|
||||||
availability_zone: None,
|
availability_zone: None,
|
||||||
status: connection_status,
|
status: connection_status,
|
||||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
connection_task: state.spawn(move |sender, _| async move {
|
||||||
sender
|
sender
|
||||||
.send(TaskStateUpdate::Progress(connection_status))
|
.send(TaskStateUpdate::Progress(connection_status))
|
||||||
.ok();
|
.ok();
|
||||||
@@ -1285,7 +1344,7 @@ mod tests {
|
|||||||
sk_id: NodeId(1),
|
sk_id: NodeId(1),
|
||||||
availability_zone: None,
|
availability_zone: None,
|
||||||
status: connection_status,
|
status: connection_status,
|
||||||
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
|
connection_task: state.spawn(move |_, _| async move { Ok(()) }),
|
||||||
discovered_new_wal: Some(NewCommittedWAL {
|
discovered_new_wal: Some(NewCommittedWAL {
|
||||||
discovered_at: time_over_threshold,
|
discovered_at: time_over_threshold,
|
||||||
lsn: new_lsn,
|
lsn: new_lsn,
|
||||||
@@ -1341,6 +1400,7 @@ mod tests {
|
|||||||
timeline_id: TIMELINE_ID,
|
timeline_id: TIMELINE_ID,
|
||||||
},
|
},
|
||||||
timeline,
|
timeline,
|
||||||
|
cancel: CancellationToken::new(),
|
||||||
conf: WalReceiverConf {
|
conf: WalReceiverConf {
|
||||||
wal_connect_timeout: Duration::from_secs(1),
|
wal_connect_timeout: Duration::from_secs(1),
|
||||||
lagging_wal_timeout: Duration::from_secs(1),
|
lagging_wal_timeout: Duration::from_secs(1),
|
||||||
@@ -1384,7 +1444,7 @@ mod tests {
|
|||||||
sk_id: connected_sk_id,
|
sk_id: connected_sk_id,
|
||||||
availability_zone: None,
|
availability_zone: None,
|
||||||
status: connection_status,
|
status: connection_status,
|
||||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
connection_task: state.spawn(move |sender, _| async move {
|
||||||
sender
|
sender
|
||||||
.send(TaskStateUpdate::Progress(connection_status))
|
.send(TaskStateUpdate::Progress(connection_status))
|
||||||
.ok();
|
.ok();
|
||||||
|
|||||||
@@ -27,7 +27,6 @@ use super::TaskStateUpdate;
|
|||||||
use crate::{
|
use crate::{
|
||||||
context::RequestContext,
|
context::RequestContext,
|
||||||
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
|
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
|
||||||
task_mgr,
|
|
||||||
task_mgr::TaskKind,
|
task_mgr::TaskKind,
|
||||||
task_mgr::WALRECEIVER_RUNTIME,
|
task_mgr::WALRECEIVER_RUNTIME,
|
||||||
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
|
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
|
||||||
@@ -37,8 +36,8 @@ use crate::{
|
|||||||
use postgres_backend::is_expected_io_error;
|
use postgres_backend::is_expected_io_error;
|
||||||
use postgres_connection::PgConnectionConfig;
|
use postgres_connection::PgConnectionConfig;
|
||||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||||
use utils::pageserver_feedback::PageserverFeedback;
|
|
||||||
use utils::{id::NodeId, lsn::Lsn};
|
use utils::{id::NodeId, lsn::Lsn};
|
||||||
|
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
|
||||||
|
|
||||||
/// Status of the connection.
|
/// Status of the connection.
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
@@ -68,6 +67,7 @@ pub(super) enum WalReceiverError {
|
|||||||
SuccessfulCompletion(String),
|
SuccessfulCompletion(String),
|
||||||
/// Generic error
|
/// Generic error
|
||||||
Other(anyhow::Error),
|
Other(anyhow::Error),
|
||||||
|
ClosedGate,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<tokio_postgres::Error> for WalReceiverError {
|
impl From<tokio_postgres::Error> for WalReceiverError {
|
||||||
@@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection(
|
|||||||
) -> Result<(), WalReceiverError> {
|
) -> Result<(), WalReceiverError> {
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
|
// prevent timeline shutdown from finishing until we have exited
|
||||||
|
let _guard = timeline.gate.enter().map_err(|e| match e {
|
||||||
|
GateError::GateClosed => WalReceiverError::ClosedGate,
|
||||||
|
})?;
|
||||||
|
// This function spawns a side-car task (WalReceiverConnectionPoller).
|
||||||
|
// Get its gate guard now as well.
|
||||||
|
let poller_guard = timeline.gate.enter().map_err(|e| match e {
|
||||||
|
GateError::GateClosed => WalReceiverError::ClosedGate,
|
||||||
|
})?;
|
||||||
|
|
||||||
WALRECEIVER_STARTED_CONNECTIONS.inc();
|
WALRECEIVER_STARTED_CONNECTIONS.inc();
|
||||||
|
|
||||||
// Connect to the database in replication mode.
|
// Connect to the database in replication mode.
|
||||||
@@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// The connection object performs the actual communication with the database,
|
// The connection object performs the actual communication with the database,
|
||||||
// so spawn it off to run on its own.
|
// so spawn it off to run on its own. It shouldn't outlive this function, but,
|
||||||
|
// due to lack of async drop, we can't enforce that. However, we ensure that
|
||||||
|
// 1. it is sensitive to `cancellation` and
|
||||||
|
// 2. holds the Timeline gate open so that after timeline shutdown,
|
||||||
|
// we know this task is gone.
|
||||||
let _connection_ctx = ctx.detached_child(
|
let _connection_ctx = ctx.detached_child(
|
||||||
TaskKind::WalReceiverConnectionPoller,
|
TaskKind::WalReceiverConnectionPoller,
|
||||||
ctx.download_behavior(),
|
ctx.download_behavior(),
|
||||||
);
|
);
|
||||||
let connection_cancellation = cancellation.clone();
|
let connection_cancellation = cancellation.clone();
|
||||||
task_mgr::spawn(
|
WALRECEIVER_RUNTIME.spawn(
|
||||||
WALRECEIVER_RUNTIME.handle(),
|
|
||||||
TaskKind::WalReceiverConnectionPoller,
|
|
||||||
Some(timeline.tenant_shard_id),
|
|
||||||
Some(timeline.timeline_id),
|
|
||||||
"walreceiver connection",
|
|
||||||
false,
|
|
||||||
async move {
|
async move {
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
select! {
|
select! {
|
||||||
connection_result = connection => match connection_result {
|
connection_result = connection => match connection_result {
|
||||||
Ok(()) => debug!("Walreceiver db connection closed"),
|
Ok(()) => debug!("Walreceiver db connection closed"),
|
||||||
@@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection(
|
|||||||
// with a similar error.
|
// with a similar error.
|
||||||
},
|
},
|
||||||
WalReceiverError::SuccessfulCompletion(_) => {}
|
WalReceiverError::SuccessfulCompletion(_) => {}
|
||||||
|
WalReceiverError::ClosedGate => {
|
||||||
|
// doesn't happen at runtime
|
||||||
|
}
|
||||||
WalReceiverError::Other(err) => {
|
WalReceiverError::Other(err) => {
|
||||||
warn!("Connection aborted: {err:#}")
|
warn!("Connection aborted: {err:#}")
|
||||||
}
|
}
|
||||||
@@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection(
|
|||||||
},
|
},
|
||||||
_ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
|
_ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
|
||||||
}
|
}
|
||||||
Ok(())
|
drop(poller_guard);
|
||||||
}
|
}
|
||||||
// Enrich the log lines emitted by this closure with meaningful context.
|
// Enrich the log lines emitted by this closure with meaningful context.
|
||||||
// TODO: technically, this task outlives the surrounding function, so, the
|
// TODO: technically, this task outlives the surrounding function, so, the
|
||||||
@@ -303,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection(
|
|||||||
|
|
||||||
trace!("received XLogData between {startlsn} and {endlsn}");
|
trace!("received XLogData between {startlsn} and {endlsn}");
|
||||||
|
|
||||||
|
WAL_INGEST.bytes_received.inc_by(data.len() as u64);
|
||||||
waldecoder.feed_bytes(data);
|
waldecoder.feed_bytes(data);
|
||||||
|
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ pub struct VectoredRead {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl VectoredRead {
|
impl VectoredRead {
|
||||||
fn size(&self) -> usize {
|
pub fn size(&self) -> usize {
|
||||||
(self.end - self.start) as usize
|
(self.end - self.start) as usize
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,11 +15,23 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
|
|||||||
.map_err(std::io::Error::from)
|
.map_err(std::io::Error::from)
|
||||||
.context("statvfs tenants directory")?;
|
.context("statvfs tenants directory")?;
|
||||||
|
|
||||||
let blocksz = statvfs.block_size();
|
// https://unix.stackexchange.com/a/703650
|
||||||
|
let blocksz = if statvfs.fragment_size() > 0 {
|
||||||
|
statvfs.fragment_size()
|
||||||
|
} else {
|
||||||
|
statvfs.block_size()
|
||||||
|
};
|
||||||
|
|
||||||
#[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
|
#[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
|
||||||
let free = statvfs.blocks_available() as u64 * blocksz;
|
let free = statvfs.blocks_available() as u64 * blocksz;
|
||||||
let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get();
|
|
||||||
|
#[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
|
||||||
|
let used = statvfs
|
||||||
|
.blocks()
|
||||||
|
// use blocks_free instead of available here to match df in case someone compares
|
||||||
|
.saturating_sub(statvfs.blocks_free()) as u64
|
||||||
|
* blocksz;
|
||||||
|
|
||||||
let captured_at = std::time::SystemTime::now();
|
let captured_at = std::time::SystemTime::now();
|
||||||
|
|
||||||
let doc = PageserverUtilization {
|
let doc = PageserverUtilization {
|
||||||
|
|||||||
@@ -36,11 +36,12 @@ use bytes::{Bytes, BytesMut};
|
|||||||
use pageserver_api::key::key_to_rel_block;
|
use pageserver_api::key::key_to_rel_block;
|
||||||
use pageserver_api::models::WalRedoManagerStatus;
|
use pageserver_api::models::WalRedoManagerStatus;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
use std::sync::{Arc, RwLock};
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
use utils::sync::heavier_once_cell;
|
||||||
|
|
||||||
///
|
///
|
||||||
/// This is the real implementation that uses a Postgres process to
|
/// This is the real implementation that uses a Postgres process to
|
||||||
@@ -53,7 +54,19 @@ pub struct PostgresRedoManager {
|
|||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||||
redo_process: RwLock<Option<Arc<process::WalRedoProcess>>>,
|
/// The current [`process::WalRedoProcess`] that is used by new redo requests.
|
||||||
|
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
|
||||||
|
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of
|
||||||
|
/// their process object; we use [`Arc::clone`] for that.
|
||||||
|
/// This is primarily because earlier implementations that didn't use [`heavier_once_cell`]
|
||||||
|
/// had that behavior; it's probably unnecessary.
|
||||||
|
/// The only merit of it is that if one walredo process encounters an error,
|
||||||
|
/// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`])
|
||||||
|
/// and retry redo, thereby starting the new process, while other redo tasks might
|
||||||
|
/// still be using the old redo process. But, those other tasks will most likely
|
||||||
|
/// encounter an error as well, and errors are an unexpected condition anyway.
|
||||||
|
/// So, probably we could get rid of the `Arc` in the future.
|
||||||
|
redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -101,6 +114,7 @@ impl PostgresRedoManager {
|
|||||||
self.conf.wal_redo_timeout,
|
self.conf.wal_redo_timeout,
|
||||||
pg_version,
|
pg_version,
|
||||||
)
|
)
|
||||||
|
.await
|
||||||
};
|
};
|
||||||
img = Some(result?);
|
img = Some(result?);
|
||||||
|
|
||||||
@@ -121,6 +135,7 @@ impl PostgresRedoManager {
|
|||||||
self.conf.wal_redo_timeout,
|
self.conf.wal_redo_timeout,
|
||||||
pg_version,
|
pg_version,
|
||||||
)
|
)
|
||||||
|
.await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,7 +149,7 @@ impl PostgresRedoManager {
|
|||||||
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
|
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()),
|
pid: self.redo_process.get().map(|p| p.id()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -152,7 +167,7 @@ impl PostgresRedoManager {
|
|||||||
tenant_shard_id,
|
tenant_shard_id,
|
||||||
conf,
|
conf,
|
||||||
last_redo_at: std::sync::Mutex::default(),
|
last_redo_at: std::sync::Mutex::default(),
|
||||||
redo_process: RwLock::new(None),
|
redo_process: heavier_once_cell::OnceCell::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -164,8 +179,7 @@ impl PostgresRedoManager {
|
|||||||
if let Some(last_redo_at) = *g {
|
if let Some(last_redo_at) = *g {
|
||||||
if last_redo_at.elapsed() >= idle_timeout {
|
if last_redo_at.elapsed() >= idle_timeout {
|
||||||
drop(g);
|
drop(g);
|
||||||
let mut guard = self.redo_process.write().unwrap();
|
drop(self.redo_process.get().map(|guard| guard.take_and_deinit()));
|
||||||
*guard = None;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -174,8 +188,11 @@ impl PostgresRedoManager {
|
|||||||
///
|
///
|
||||||
/// Process one request for WAL redo using wal-redo postgres
|
/// Process one request for WAL redo using wal-redo postgres
|
||||||
///
|
///
|
||||||
|
/// # Cancel-Safety
|
||||||
|
///
|
||||||
|
/// Cancellation safe.
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
fn apply_batch_postgres(
|
async fn apply_batch_postgres(
|
||||||
&self,
|
&self,
|
||||||
key: Key,
|
key: Key,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
@@ -191,42 +208,31 @@ impl PostgresRedoManager {
|
|||||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||||
let mut n_attempts = 0u32;
|
let mut n_attempts = 0u32;
|
||||||
loop {
|
loop {
|
||||||
// launch the WAL redo process on first use
|
let proc: Arc<process::WalRedoProcess> =
|
||||||
let proc: Arc<process::WalRedoProcess> = {
|
match self.redo_process.get_or_init_detached().await {
|
||||||
let proc_guard = self.redo_process.read().unwrap();
|
Ok(guard) => Arc::clone(&guard),
|
||||||
match &*proc_guard {
|
Err(permit) => {
|
||||||
None => {
|
// don't hold poison_guard, the launch code can bail
|
||||||
// "upgrade" to write lock to launch the process
|
let start = Instant::now();
|
||||||
drop(proc_guard);
|
let proc = Arc::new(
|
||||||
let mut proc_guard = self.redo_process.write().unwrap();
|
process::WalRedoProcess::launch(
|
||||||
match &*proc_guard {
|
self.conf,
|
||||||
None => {
|
self.tenant_shard_id,
|
||||||
let start = Instant::now();
|
pg_version,
|
||||||
let proc = Arc::new(
|
)
|
||||||
process::WalRedoProcess::launch(
|
.context("launch walredo process")?,
|
||||||
self.conf,
|
);
|
||||||
self.tenant_shard_id,
|
let duration = start.elapsed();
|
||||||
pg_version,
|
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||||
)
|
info!(
|
||||||
.context("launch walredo process")?,
|
duration_ms = duration.as_millis(),
|
||||||
);
|
pid = proc.id(),
|
||||||
let duration = start.elapsed();
|
"launched walredo process"
|
||||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM
|
);
|
||||||
.observe(duration.as_secs_f64());
|
self.redo_process.set(Arc::clone(&proc), permit);
|
||||||
info!(
|
proc
|
||||||
duration_ms = duration.as_millis(),
|
|
||||||
pid = proc.id(),
|
|
||||||
"launched walredo process"
|
|
||||||
);
|
|
||||||
*proc_guard = Some(Arc::clone(&proc));
|
|
||||||
proc
|
|
||||||
}
|
|
||||||
Some(proc) => Arc::clone(proc),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Some(proc) => Arc::clone(proc),
|
};
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let started_at = std::time::Instant::now();
|
let started_at = std::time::Instant::now();
|
||||||
|
|
||||||
@@ -272,34 +278,34 @@ impl PostgresRedoManager {
|
|||||||
n_attempts,
|
n_attempts,
|
||||||
e,
|
e,
|
||||||
);
|
);
|
||||||
// Avoid concurrent callers hitting the same issue.
|
// Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
|
||||||
// We can't prevent it from happening because we want to enable parallelism.
|
// Note that there may be other tasks concurrent with us that also hold `proc`.
|
||||||
{
|
// We have to deal with that here.
|
||||||
let mut guard = self.redo_process.write().unwrap();
|
// Also read the doc comment on field `self.redo_process`.
|
||||||
match &*guard {
|
//
|
||||||
Some(current_field_value) => {
|
|
||||||
if Arc::ptr_eq(current_field_value, &proc) {
|
|
||||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
|
||||||
*guard = None;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
// Another thread was faster to observe the error, and already took the process out of rotation.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// NB: there may still be other concurrent threads using `proc`.
|
// NB: there may still be other concurrent threads using `proc`.
|
||||||
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
|
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
|
||||||
// NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
|
//
|
||||||
// holding the lock while waiting for the process to exit.
|
// NB: the drop impl blocks the dropping thread with a wait() system call for
|
||||||
// NB: the drop impl blocks the current threads with a wait() system call for
|
// the child process. In some ways the blocking is actually good: if we
|
||||||
// the child process. We dropped the `guard` above so that other threads aren't
|
// deferred the waiting into the background / to tokio if we used `tokio::process`,
|
||||||
// affected. But, it's good that the current thread _does_ block to wait.
|
// it could happen that if walredo always fails immediately, we spawn processes faster
|
||||||
// If we instead deferred the waiting into the background / to tokio, it could
|
|
||||||
// happen that if walredo always fails immediately, we spawn processes faster
|
|
||||||
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
|
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
|
||||||
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
|
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
|
||||||
// This probably needs revisiting at some later point.
|
// This probably needs revisiting at some later point.
|
||||||
|
match self.redo_process.get() {
|
||||||
|
None => (),
|
||||||
|
Some(guard) => {
|
||||||
|
if Arc::ptr_eq(&proc, &*guard) {
|
||||||
|
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||||
|
guard.take_and_deinit();
|
||||||
|
} else {
|
||||||
|
// Another task already spawned another redo process (further up in this method)
|
||||||
|
// and put it into `redo_process`. Do nothing, our view of the world is behind.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
|
||||||
drop(proc);
|
drop(proc);
|
||||||
} else if n_attempts != 0 {
|
} else if n_attempts != 0 {
|
||||||
info!(n_attempts, "retried walredo succeeded");
|
info!(n_attempts, "retried walredo succeeded");
|
||||||
|
|||||||
@@ -111,6 +111,7 @@ static PageServer page_servers[MAX_SHARDS];
|
|||||||
|
|
||||||
static bool pageserver_flush(shardno_t shard_no);
|
static bool pageserver_flush(shardno_t shard_no);
|
||||||
static void pageserver_disconnect(shardno_t shard_no);
|
static void pageserver_disconnect(shardno_t shard_no);
|
||||||
|
static void pageserver_disconnect_shard(shardno_t shard_no);
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
PagestoreShmemIsValid(void)
|
PagestoreShmemIsValid(void)
|
||||||
@@ -487,9 +488,32 @@ retry:
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reset prefetch and drop connection to the shard.
|
||||||
|
* It also drops connection to all other shards involved in prefetch.
|
||||||
|
*/
|
||||||
static void
|
static void
|
||||||
pageserver_disconnect(shardno_t shard_no)
|
pageserver_disconnect(shardno_t shard_no)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* If the connection to any pageserver is lost, we throw away the
|
||||||
|
* whole prefetch queue, even for other pageservers. It should not
|
||||||
|
* cause big problems, because connection loss is supposed to be a
|
||||||
|
* rare event.
|
||||||
|
*
|
||||||
|
* Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
|
||||||
|
* because prefetch request may be registered before connection is established.
|
||||||
|
*/
|
||||||
|
prefetch_on_ps_disconnect();
|
||||||
|
|
||||||
|
pageserver_disconnect_shard(shard_no);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Disconnect from specified shard
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
pageserver_disconnect_shard(shardno_t shard_no)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* If anything goes wrong while we were sending a request, it's not clear
|
* If anything goes wrong while we were sending a request, it's not clear
|
||||||
@@ -503,14 +527,6 @@ pageserver_disconnect(shardno_t shard_no)
|
|||||||
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
|
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
|
||||||
PQfinish(page_servers[shard_no].conn);
|
PQfinish(page_servers[shard_no].conn);
|
||||||
page_servers[shard_no].conn = NULL;
|
page_servers[shard_no].conn = NULL;
|
||||||
|
|
||||||
/*
|
|
||||||
* If the connection to any pageserver is lost, we throw away the
|
|
||||||
* whole prefetch queue, even for other pageservers. It should not
|
|
||||||
* cause big problems, because connection loss is supposed to be a
|
|
||||||
* rare event.
|
|
||||||
*/
|
|
||||||
prefetch_on_ps_disconnect();
|
|
||||||
}
|
}
|
||||||
if (page_servers[shard_no].wes != NULL)
|
if (page_servers[shard_no].wes != NULL)
|
||||||
{
|
{
|
||||||
@@ -676,7 +692,8 @@ page_server_api api =
|
|||||||
{
|
{
|
||||||
.send = pageserver_send,
|
.send = pageserver_send,
|
||||||
.flush = pageserver_flush,
|
.flush = pageserver_flush,
|
||||||
.receive = pageserver_receive
|
.receive = pageserver_receive,
|
||||||
|
.disconnect = pageserver_disconnect_shard
|
||||||
};
|
};
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
|
|||||||
@@ -180,6 +180,7 @@ typedef struct
|
|||||||
bool (*send) (shardno_t shard_no, NeonRequest * request);
|
bool (*send) (shardno_t shard_no, NeonRequest * request);
|
||||||
NeonResponse *(*receive) (shardno_t shard_no);
|
NeonResponse *(*receive) (shardno_t shard_no);
|
||||||
bool (*flush) (shardno_t shard_no);
|
bool (*flush) (shardno_t shard_no);
|
||||||
|
void (*disconnect) (shardno_t shard_no);
|
||||||
} page_server_api;
|
} page_server_api;
|
||||||
|
|
||||||
extern void prefetch_on_ps_disconnect(void);
|
extern void prefetch_on_ps_disconnect(void);
|
||||||
|
|||||||
@@ -613,6 +613,14 @@ prefetch_on_ps_disconnect(void)
|
|||||||
Assert(slot->status == PRFS_REQUESTED);
|
Assert(slot->status == PRFS_REQUESTED);
|
||||||
Assert(slot->my_ring_index == ring_index);
|
Assert(slot->my_ring_index == ring_index);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Drop connection to all shards which have prefetch requests.
|
||||||
|
* It is not a problem to call disconnect multiple times on the same connection
|
||||||
|
* because disconnect implementation in libpagestore.c will check if connection
|
||||||
|
* is alive and do nothing if connection was already dropped.
|
||||||
|
*/
|
||||||
|
page_server->disconnect(slot->shard_no);
|
||||||
|
|
||||||
/* clean up the request */
|
/* clean up the request */
|
||||||
slot->status = PRFS_TAG_REMAINS;
|
slot->status = PRFS_TAG_REMAINS;
|
||||||
MyPState->n_requests_inflight -= 1;
|
MyPState->n_requests_inflight -= 1;
|
||||||
@@ -633,13 +641,12 @@ prefetch_on_ps_disconnect(void)
|
|||||||
static inline void
|
static inline void
|
||||||
prefetch_set_unused(uint64 ring_index)
|
prefetch_set_unused(uint64 ring_index)
|
||||||
{
|
{
|
||||||
PrefetchRequest *slot = GetPrfSlot(ring_index);
|
PrefetchRequest *slot;
|
||||||
|
|
||||||
if (ring_index < MyPState->ring_last)
|
if (ring_index < MyPState->ring_last)
|
||||||
return; /* Should already be unused */
|
return; /* Should already be unused */
|
||||||
|
|
||||||
Assert(MyPState->ring_unused > ring_index);
|
slot = GetPrfSlot(ring_index);
|
||||||
|
|
||||||
if (slot->status == PRFS_UNUSED)
|
if (slot->status == PRFS_UNUSED)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@@ -798,7 +805,8 @@ Retry:
|
|||||||
{
|
{
|
||||||
if (*force_lsn > slot->effective_request_lsn)
|
if (*force_lsn > slot->effective_request_lsn)
|
||||||
{
|
{
|
||||||
prefetch_wait_for(ring_index);
|
if (!prefetch_wait_for(ring_index))
|
||||||
|
goto Retry;
|
||||||
prefetch_set_unused(ring_index);
|
prefetch_set_unused(ring_index);
|
||||||
entry = NULL;
|
entry = NULL;
|
||||||
}
|
}
|
||||||
@@ -813,7 +821,8 @@ Retry:
|
|||||||
{
|
{
|
||||||
if (*force_lsn != slot->effective_request_lsn)
|
if (*force_lsn != slot->effective_request_lsn)
|
||||||
{
|
{
|
||||||
prefetch_wait_for(ring_index);
|
if (!prefetch_wait_for(ring_index))
|
||||||
|
goto Retry;
|
||||||
prefetch_set_unused(ring_index);
|
prefetch_set_unused(ring_index);
|
||||||
entry = NULL;
|
entry = NULL;
|
||||||
}
|
}
|
||||||
@@ -879,7 +888,8 @@ Retry:
|
|||||||
{
|
{
|
||||||
case PRFS_REQUESTED:
|
case PRFS_REQUESTED:
|
||||||
Assert(MyPState->ring_receive == cleanup_index);
|
Assert(MyPState->ring_receive == cleanup_index);
|
||||||
prefetch_wait_for(cleanup_index);
|
if (!prefetch_wait_for(cleanup_index))
|
||||||
|
goto Retry;
|
||||||
prefetch_set_unused(cleanup_index);
|
prefetch_set_unused(cleanup_index);
|
||||||
break;
|
break;
|
||||||
case PRFS_RECEIVED:
|
case PRFS_RECEIVED:
|
||||||
@@ -1680,7 +1690,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
|
||||||
}
|
}
|
||||||
pfree(resp);
|
pfree(resp);
|
||||||
return exists;
|
return exists;
|
||||||
@@ -2132,6 +2142,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
/*
|
/*
|
||||||
* Try to find prefetched page in the list of received pages.
|
* Try to find prefetched page in the list of received pages.
|
||||||
*/
|
*/
|
||||||
|
Retry:
|
||||||
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
|
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
|
||||||
|
|
||||||
if (entry != NULL)
|
if (entry != NULL)
|
||||||
@@ -2153,7 +2164,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
*/
|
*/
|
||||||
if (slot->status == PRFS_REQUESTED)
|
if (slot->status == PRFS_REQUESTED)
|
||||||
{
|
{
|
||||||
prefetch_wait_for(slot->my_ring_index);
|
if (!prefetch_wait_for(slot->my_ring_index))
|
||||||
|
goto Retry;
|
||||||
}
|
}
|
||||||
/* drop caches */
|
/* drop caches */
|
||||||
prefetch_set_unused(slot->my_ring_index);
|
prefetch_set_unused(slot->my_ring_index);
|
||||||
@@ -2216,7 +2228,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
|||||||
((NeonErrorResponse *) resp)->message)));
|
((NeonErrorResponse *) resp)->message)));
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* buffer was used, clean up for later reuse */
|
/* buffer was used, clean up for later reuse */
|
||||||
@@ -2489,7 +2501,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
|
||||||
}
|
}
|
||||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||||
|
|
||||||
@@ -2544,7 +2556,7 @@ neon_dbsize(Oid dbNode)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
|
||||||
}
|
}
|
||||||
|
|
||||||
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||||
@@ -2849,7 +2861,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
|
||||||
}
|
}
|
||||||
pfree(resp);
|
pfree(resp);
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ testing = []
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
|
async-compression.workspace = true
|
||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
aws-config.workspace = true
|
aws-config.workspace = true
|
||||||
aws-sdk-iam.workspace = true
|
aws-sdk-iam.workspace = true
|
||||||
|
|||||||
@@ -102,8 +102,7 @@ pub(super) async fn authenticate(
|
|||||||
|
|
||||||
ctx.set_user(db_info.user.into());
|
ctx.set_user(db_info.user.into());
|
||||||
ctx.set_project(db_info.aux.clone());
|
ctx.set_project(db_info.aux.clone());
|
||||||
let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default();
|
info!("woken up a compute node");
|
||||||
info!(?cold_start_info, "woken up a compute node");
|
|
||||||
|
|
||||||
// Backwards compatibility. pg_sni_proxy uses "--" in domain names
|
// Backwards compatibility. pg_sni_proxy uses "--" in domain names
|
||||||
// while direct connections do not. Once we migrate to pg_sni_proxy
|
// while direct connections do not. Once we migrate to pg_sni_proxy
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ use itertools::Itertools;
|
|||||||
use proxy::config::TlsServerEndPoint;
|
use proxy::config::TlsServerEndPoint;
|
||||||
use proxy::context::RequestMonitoring;
|
use proxy::context::RequestMonitoring;
|
||||||
use proxy::proxy::run_until_cancelled;
|
use proxy::proxy::run_until_cancelled;
|
||||||
|
use proxy::{BranchId, EndpointId, ProjectId};
|
||||||
use rustls::pki_types::PrivateKeyDer;
|
use rustls::pki_types::PrivateKeyDer;
|
||||||
use tokio::net::TcpListener;
|
use tokio::net::TcpListener;
|
||||||
|
|
||||||
@@ -269,7 +270,12 @@ async fn handle_client(
|
|||||||
|
|
||||||
let client = tokio::net::TcpStream::connect(destination).await?;
|
let client = tokio::net::TcpStream::connect(destination).await?;
|
||||||
|
|
||||||
let metrics_aux: MetricsAuxInfo = Default::default();
|
let metrics_aux: MetricsAuxInfo = MetricsAuxInfo {
|
||||||
|
endpoint_id: (&EndpointId::from("")).into(),
|
||||||
|
project_id: (&ProjectId::from("")).into(),
|
||||||
|
branch_id: (&BranchId::from("")).into(),
|
||||||
|
cold_start_info: proxy::console::messages::ColdStartInfo::Unknown,
|
||||||
|
};
|
||||||
|
|
||||||
// doesn't yet matter as pg-sni-router doesn't report analytics logs
|
// doesn't yet matter as pg-sni-router doesn't report analytics logs
|
||||||
ctx.set_success();
|
ctx.set_success();
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ use proxy::auth;
|
|||||||
use proxy::auth::backend::MaybeOwned;
|
use proxy::auth::backend::MaybeOwned;
|
||||||
use proxy::cancellation::CancelMap;
|
use proxy::cancellation::CancelMap;
|
||||||
use proxy::cancellation::CancellationHandler;
|
use proxy::cancellation::CancellationHandler;
|
||||||
|
use proxy::config::remote_storage_from_toml;
|
||||||
use proxy::config::AuthenticationConfig;
|
use proxy::config::AuthenticationConfig;
|
||||||
use proxy::config::CacheOptions;
|
use proxy::config::CacheOptions;
|
||||||
use proxy::config::HttpConfig;
|
use proxy::config::HttpConfig;
|
||||||
@@ -191,6 +192,19 @@ struct ProxyCliArgs {
|
|||||||
|
|
||||||
#[clap(flatten)]
|
#[clap(flatten)]
|
||||||
parquet_upload: ParquetUploadArgs,
|
parquet_upload: ParquetUploadArgs,
|
||||||
|
|
||||||
|
/// interval for backup metric collection
|
||||||
|
#[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
|
||||||
|
metric_backup_collection_interval: std::time::Duration,
|
||||||
|
/// remote storage configuration for backup metric collection
|
||||||
|
/// Encoded as toml (same format as pageservers), eg
|
||||||
|
/// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
|
||||||
|
#[clap(long, default_value = "{}")]
|
||||||
|
metric_backup_collection_remote_storage: String,
|
||||||
|
/// chunk size for backup metric collection
|
||||||
|
/// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
|
||||||
|
#[clap(long, default_value = "4194304")]
|
||||||
|
metric_backup_collection_chunk_size: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(clap::Args, Clone, Copy, Debug)]
|
#[derive(clap::Args, Clone, Copy, Debug)]
|
||||||
@@ -372,12 +386,17 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
|
|
||||||
// maintenance tasks. these never return unless there's an error
|
// maintenance tasks. these never return unless there's an error
|
||||||
let mut maintenance_tasks = JoinSet::new();
|
let mut maintenance_tasks = JoinSet::new();
|
||||||
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
|
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
|
||||||
maintenance_tasks.spawn(http::health_server::task_main(http_listener));
|
maintenance_tasks.spawn(http::health_server::task_main(http_listener));
|
||||||
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
|
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
|
||||||
|
|
||||||
if let Some(metrics_config) = &config.metric_collection {
|
if let Some(metrics_config) = &config.metric_collection {
|
||||||
|
// TODO: Add gc regardless of the metric collection being enabled.
|
||||||
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
|
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
|
||||||
|
client_tasks.spawn(usage_metrics::task_backup(
|
||||||
|
&metrics_config.backup_metric_collection_config,
|
||||||
|
cancellation_token,
|
||||||
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
if let auth::BackendType::Console(api, _) = &config.auth_backend {
|
if let auth::BackendType::Console(api, _) = &config.auth_backend {
|
||||||
@@ -434,6 +453,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
|||||||
if args.allow_self_signed_compute {
|
if args.allow_self_signed_compute {
|
||||||
warn!("allowing self-signed compute certificates");
|
warn!("allowing self-signed compute certificates");
|
||||||
}
|
}
|
||||||
|
let backup_metric_collection_config = config::MetricBackupCollectionConfig {
|
||||||
|
interval: args.metric_backup_collection_interval,
|
||||||
|
remote_storage_config: remote_storage_from_toml(
|
||||||
|
&args.metric_backup_collection_remote_storage,
|
||||||
|
)?,
|
||||||
|
chunk_size: args.metric_backup_collection_chunk_size,
|
||||||
|
};
|
||||||
|
|
||||||
let metric_collection = match (
|
let metric_collection = match (
|
||||||
&args.metric_collection_endpoint,
|
&args.metric_collection_endpoint,
|
||||||
@@ -442,6 +468,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
|||||||
(Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
|
(Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
|
||||||
endpoint: endpoint.parse()?,
|
endpoint: endpoint.parse()?,
|
||||||
interval: humantime::parse_duration(interval)?,
|
interval: humantime::parse_duration(interval)?,
|
||||||
|
backup_metric_collection_config,
|
||||||
}),
|
}),
|
||||||
(None, None) => None,
|
(None, None) => None,
|
||||||
_ => bail!(
|
_ => bail!(
|
||||||
|
|||||||
98
proxy/src/cache/project_info.rs
vendored
98
proxy/src/cache/project_info.rs
vendored
@@ -16,7 +16,7 @@ use crate::{
|
|||||||
config::ProjectInfoCacheOptions,
|
config::ProjectInfoCacheOptions,
|
||||||
console::AuthSecret,
|
console::AuthSecret,
|
||||||
intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
|
intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
|
||||||
EndpointId, ProjectId, RoleName,
|
EndpointId, RoleName,
|
||||||
};
|
};
|
||||||
|
|
||||||
use super::{Cache, Cached};
|
use super::{Cache, Cached};
|
||||||
@@ -214,14 +214,11 @@ impl ProjectInfoCacheImpl {
|
|||||||
}
|
}
|
||||||
pub fn insert_role_secret(
|
pub fn insert_role_secret(
|
||||||
&self,
|
&self,
|
||||||
project_id: &ProjectId,
|
project_id: ProjectIdInt,
|
||||||
endpoint_id: &EndpointId,
|
endpoint_id: EndpointIdInt,
|
||||||
role_name: &RoleName,
|
role_name: RoleNameInt,
|
||||||
secret: Option<AuthSecret>,
|
secret: Option<AuthSecret>,
|
||||||
) {
|
) {
|
||||||
let project_id = ProjectIdInt::from(project_id);
|
|
||||||
let endpoint_id = EndpointIdInt::from(endpoint_id);
|
|
||||||
let role_name = RoleNameInt::from(role_name);
|
|
||||||
if self.cache.len() >= self.config.size {
|
if self.cache.len() >= self.config.size {
|
||||||
// If there are too many entries, wait until the next gc cycle.
|
// If there are too many entries, wait until the next gc cycle.
|
||||||
return;
|
return;
|
||||||
@@ -234,12 +231,10 @@ impl ProjectInfoCacheImpl {
|
|||||||
}
|
}
|
||||||
pub fn insert_allowed_ips(
|
pub fn insert_allowed_ips(
|
||||||
&self,
|
&self,
|
||||||
project_id: &ProjectId,
|
project_id: ProjectIdInt,
|
||||||
endpoint_id: &EndpointId,
|
endpoint_id: EndpointIdInt,
|
||||||
allowed_ips: Arc<Vec<IpPattern>>,
|
allowed_ips: Arc<Vec<IpPattern>>,
|
||||||
) {
|
) {
|
||||||
let project_id = ProjectIdInt::from(project_id);
|
|
||||||
let endpoint_id = EndpointIdInt::from(endpoint_id);
|
|
||||||
if self.cache.len() >= self.config.size {
|
if self.cache.len() >= self.config.size {
|
||||||
// If there are too many entries, wait until the next gc cycle.
|
// If there are too many entries, wait until the next gc cycle.
|
||||||
return;
|
return;
|
||||||
@@ -358,7 +353,7 @@ impl Cache for ProjectInfoCacheImpl {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::scram::ServerSecret;
|
use crate::{scram::ServerSecret, ProjectId};
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_project_info_cache_settings() {
|
async fn test_project_info_cache_settings() {
|
||||||
@@ -369,8 +364,8 @@ mod tests {
|
|||||||
ttl: Duration::from_secs(1),
|
ttl: Duration::from_secs(1),
|
||||||
gc_interval: Duration::from_secs(600),
|
gc_interval: Duration::from_secs(600),
|
||||||
});
|
});
|
||||||
let project_id = "project".into();
|
let project_id: ProjectId = "project".into();
|
||||||
let endpoint_id = "endpoint".into();
|
let endpoint_id: EndpointId = "endpoint".into();
|
||||||
let user1: RoleName = "user1".into();
|
let user1: RoleName = "user1".into();
|
||||||
let user2: RoleName = "user2".into();
|
let user2: RoleName = "user2".into();
|
||||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
||||||
@@ -379,9 +374,23 @@ mod tests {
|
|||||||
"127.0.0.1".parse().unwrap(),
|
"127.0.0.1".parse().unwrap(),
|
||||||
"127.0.0.2".parse().unwrap(),
|
"127.0.0.2".parse().unwrap(),
|
||||||
]);
|
]);
|
||||||
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
|
cache.insert_role_secret(
|
||||||
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
|
(&project_id).into(),
|
||||||
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
|
(&endpoint_id).into(),
|
||||||
|
(&user1).into(),
|
||||||
|
secret1.clone(),
|
||||||
|
);
|
||||||
|
cache.insert_role_secret(
|
||||||
|
(&project_id).into(),
|
||||||
|
(&endpoint_id).into(),
|
||||||
|
(&user2).into(),
|
||||||
|
secret2.clone(),
|
||||||
|
);
|
||||||
|
cache.insert_allowed_ips(
|
||||||
|
(&project_id).into(),
|
||||||
|
(&endpoint_id).into(),
|
||||||
|
allowed_ips.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
|
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
|
||||||
assert!(cached.cached());
|
assert!(cached.cached());
|
||||||
@@ -393,7 +402,12 @@ mod tests {
|
|||||||
// Shouldn't add more than 2 roles.
|
// Shouldn't add more than 2 roles.
|
||||||
let user3: RoleName = "user3".into();
|
let user3: RoleName = "user3".into();
|
||||||
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
|
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
|
||||||
cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
|
cache.insert_role_secret(
|
||||||
|
(&project_id).into(),
|
||||||
|
(&endpoint_id).into(),
|
||||||
|
(&user3).into(),
|
||||||
|
secret3.clone(),
|
||||||
|
);
|
||||||
assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
|
assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
|
||||||
|
|
||||||
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
|
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
|
||||||
@@ -421,8 +435,8 @@ mod tests {
|
|||||||
cache.clone().disable_ttl();
|
cache.clone().disable_ttl();
|
||||||
tokio::time::advance(Duration::from_secs(2)).await;
|
tokio::time::advance(Duration::from_secs(2)).await;
|
||||||
|
|
||||||
let project_id = "project".into();
|
let project_id: ProjectId = "project".into();
|
||||||
let endpoint_id = "endpoint".into();
|
let endpoint_id: EndpointId = "endpoint".into();
|
||||||
let user1: RoleName = "user1".into();
|
let user1: RoleName = "user1".into();
|
||||||
let user2: RoleName = "user2".into();
|
let user2: RoleName = "user2".into();
|
||||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
||||||
@@ -431,9 +445,23 @@ mod tests {
|
|||||||
"127.0.0.1".parse().unwrap(),
|
"127.0.0.1".parse().unwrap(),
|
||||||
"127.0.0.2".parse().unwrap(),
|
"127.0.0.2".parse().unwrap(),
|
||||||
]);
|
]);
|
||||||
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
|
cache.insert_role_secret(
|
||||||
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
|
(&project_id).into(),
|
||||||
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
|
(&endpoint_id).into(),
|
||||||
|
(&user1).into(),
|
||||||
|
secret1.clone(),
|
||||||
|
);
|
||||||
|
cache.insert_role_secret(
|
||||||
|
(&project_id).into(),
|
||||||
|
(&endpoint_id).into(),
|
||||||
|
(&user2).into(),
|
||||||
|
secret2.clone(),
|
||||||
|
);
|
||||||
|
cache.insert_allowed_ips(
|
||||||
|
(&project_id).into(),
|
||||||
|
(&endpoint_id).into(),
|
||||||
|
allowed_ips.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
tokio::time::advance(Duration::from_secs(2)).await;
|
tokio::time::advance(Duration::from_secs(2)).await;
|
||||||
// Nothing should be invalidated.
|
// Nothing should be invalidated.
|
||||||
@@ -470,8 +498,8 @@ mod tests {
|
|||||||
gc_interval: Duration::from_secs(600),
|
gc_interval: Duration::from_secs(600),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
let project_id = "project".into();
|
let project_id: ProjectId = "project".into();
|
||||||
let endpoint_id = "endpoint".into();
|
let endpoint_id: EndpointId = "endpoint".into();
|
||||||
let user1: RoleName = "user1".into();
|
let user1: RoleName = "user1".into();
|
||||||
let user2: RoleName = "user2".into();
|
let user2: RoleName = "user2".into();
|
||||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
||||||
@@ -480,10 +508,20 @@ mod tests {
|
|||||||
"127.0.0.1".parse().unwrap(),
|
"127.0.0.1".parse().unwrap(),
|
||||||
"127.0.0.2".parse().unwrap(),
|
"127.0.0.2".parse().unwrap(),
|
||||||
]);
|
]);
|
||||||
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
|
cache.insert_role_secret(
|
||||||
|
(&project_id).into(),
|
||||||
|
(&endpoint_id).into(),
|
||||||
|
(&user1).into(),
|
||||||
|
secret1.clone(),
|
||||||
|
);
|
||||||
cache.clone().disable_ttl();
|
cache.clone().disable_ttl();
|
||||||
tokio::time::advance(Duration::from_millis(100)).await;
|
tokio::time::advance(Duration::from_millis(100)).await;
|
||||||
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
|
cache.insert_role_secret(
|
||||||
|
(&project_id).into(),
|
||||||
|
(&endpoint_id).into(),
|
||||||
|
(&user2).into(),
|
||||||
|
secret2.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
// Added before ttl was disabled + ttl should be still cached.
|
// Added before ttl was disabled + ttl should be still cached.
|
||||||
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
|
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
|
||||||
@@ -497,7 +535,11 @@ mod tests {
|
|||||||
assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
|
assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
|
||||||
|
|
||||||
// Added after ttl was disabled + ttl should not be cached.
|
// Added after ttl was disabled + ttl should not be cached.
|
||||||
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
|
cache.insert_allowed_ips(
|
||||||
|
(&project_id).into(),
|
||||||
|
(&endpoint_id).into(),
|
||||||
|
allowed_ips.clone(),
|
||||||
|
);
|
||||||
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
|
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
|
||||||
assert!(!cached.cached());
|
assert!(!cached.cached());
|
||||||
|
|
||||||
|
|||||||
@@ -276,6 +276,7 @@ impl ConnCfg {
|
|||||||
let stream = connection.stream.into_inner();
|
let stream = connection.stream.into_inner();
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
|
cold_start_info = ctx.cold_start_info.as_str(),
|
||||||
"connected to compute node at {host} ({socket_addr}) sslmode={:?}",
|
"connected to compute node at {host} ({socket_addr}) sslmode={:?}",
|
||||||
self.0.get_ssl_mode()
|
self.0.get_ssl_mode()
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use crate::{
|
|||||||
};
|
};
|
||||||
use anyhow::{bail, ensure, Context, Ok};
|
use anyhow::{bail, ensure, Context, Ok};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
|
use remote_storage::RemoteStorageConfig;
|
||||||
use rustls::{
|
use rustls::{
|
||||||
crypto::ring::sign,
|
crypto::ring::sign,
|
||||||
pki_types::{CertificateDer, PrivateKeyDer},
|
pki_types::{CertificateDer, PrivateKeyDer},
|
||||||
@@ -39,6 +40,7 @@ pub struct ProxyConfig {
|
|||||||
pub struct MetricCollectionConfig {
|
pub struct MetricCollectionConfig {
|
||||||
pub endpoint: reqwest::Url,
|
pub endpoint: reqwest::Url,
|
||||||
pub interval: Duration,
|
pub interval: Duration,
|
||||||
|
pub backup_metric_collection_config: MetricBackupCollectionConfig,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct TlsConfig {
|
pub struct TlsConfig {
|
||||||
@@ -311,6 +313,21 @@ impl CertResolver {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct MetricBackupCollectionConfig {
|
||||||
|
pub interval: Duration,
|
||||||
|
pub remote_storage_config: OptRemoteStorageConfig,
|
||||||
|
pub chunk_size: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
|
||||||
|
/// runtime type errors from the value parser we use.
|
||||||
|
pub type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
|
||||||
|
|
||||||
|
pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
|
||||||
|
RemoteStorageConfig::from_toml(&s.parse()?)
|
||||||
|
}
|
||||||
|
|
||||||
/// Helper for cmdline cache options parsing.
|
/// Helper for cmdline cache options parsing.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct CacheOptions {
|
pub struct CacheOptions {
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ use std::fmt;
|
|||||||
|
|
||||||
use crate::auth::IpPattern;
|
use crate::auth::IpPattern;
|
||||||
|
|
||||||
use crate::{BranchId, EndpointId, ProjectId};
|
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
|
||||||
|
|
||||||
/// Generic error response with human-readable description.
|
/// Generic error response with human-readable description.
|
||||||
/// Note that we can't always present it to user as is.
|
/// Note that we can't always present it to user as is.
|
||||||
@@ -18,7 +18,7 @@ pub struct ConsoleError {
|
|||||||
pub struct GetRoleSecret {
|
pub struct GetRoleSecret {
|
||||||
pub role_secret: Box<str>,
|
pub role_secret: Box<str>,
|
||||||
pub allowed_ips: Option<Vec<IpPattern>>,
|
pub allowed_ips: Option<Vec<IpPattern>>,
|
||||||
pub project_id: Option<ProjectId>,
|
pub project_id: Option<ProjectIdInt>,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Manually implement debug to omit sensitive info.
|
// Manually implement debug to omit sensitive info.
|
||||||
@@ -93,22 +93,47 @@ impl fmt::Debug for DatabaseInfo {
|
|||||||
|
|
||||||
/// Various labels for prometheus metrics.
|
/// Various labels for prometheus metrics.
|
||||||
/// Also known as `ProxyMetricsAuxInfo` in the console.
|
/// Also known as `ProxyMetricsAuxInfo` in the console.
|
||||||
#[derive(Debug, Deserialize, Clone, Default)]
|
#[derive(Debug, Deserialize, Clone)]
|
||||||
pub struct MetricsAuxInfo {
|
pub struct MetricsAuxInfo {
|
||||||
pub endpoint_id: EndpointId,
|
pub endpoint_id: EndpointIdInt,
|
||||||
pub project_id: ProjectId,
|
pub project_id: ProjectIdInt,
|
||||||
pub branch_id: BranchId,
|
pub branch_id: BranchIdInt,
|
||||||
pub cold_start_info: Option<ColdStartInfo>,
|
#[serde(default)]
|
||||||
|
pub cold_start_info: ColdStartInfo,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
|
#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)]
|
||||||
#[serde(rename_all = "snake_case")]
|
#[serde(rename_all = "snake_case")]
|
||||||
pub enum ColdStartInfo {
|
pub enum ColdStartInfo {
|
||||||
#[default]
|
#[default]
|
||||||
Unknown = 0,
|
Unknown,
|
||||||
Warm = 1,
|
/// Compute was already running
|
||||||
PoolHit = 2,
|
Warm,
|
||||||
PoolMiss = 3,
|
#[serde(rename = "pool_hit")]
|
||||||
|
/// Compute was not running but there was an available VM
|
||||||
|
VmPoolHit,
|
||||||
|
#[serde(rename = "pool_miss")]
|
||||||
|
/// Compute was not running and there were no VMs available
|
||||||
|
VmPoolMiss,
|
||||||
|
|
||||||
|
// not provided by control plane
|
||||||
|
/// Connection available from HTTP pool
|
||||||
|
HttpPoolHit,
|
||||||
|
/// Cached connection info
|
||||||
|
WarmCached,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ColdStartInfo {
|
||||||
|
pub fn as_str(&self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
ColdStartInfo::Unknown => "unknown",
|
||||||
|
ColdStartInfo::Warm => "warm",
|
||||||
|
ColdStartInfo::VmPoolHit => "pool_hit",
|
||||||
|
ColdStartInfo::VmPoolMiss => "pool_miss",
|
||||||
|
ColdStartInfo::HttpPoolHit => "http_pool_hit",
|
||||||
|
ColdStartInfo::WarmCached => "warm_cached",
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
@@ -12,7 +12,8 @@ use crate::{
|
|||||||
compute,
|
compute,
|
||||||
config::{CacheOptions, ProjectInfoCacheOptions},
|
config::{CacheOptions, ProjectInfoCacheOptions},
|
||||||
context::RequestMonitoring,
|
context::RequestMonitoring,
|
||||||
scram, EndpointCacheKey, ProjectId,
|
intern::ProjectIdInt,
|
||||||
|
scram, EndpointCacheKey,
|
||||||
};
|
};
|
||||||
use dashmap::DashMap;
|
use dashmap::DashMap;
|
||||||
use std::{sync::Arc, time::Duration};
|
use std::{sync::Arc, time::Duration};
|
||||||
@@ -271,7 +272,7 @@ pub struct AuthInfo {
|
|||||||
/// List of IP addresses allowed for the autorization.
|
/// List of IP addresses allowed for the autorization.
|
||||||
pub allowed_ips: Vec<IpPattern>,
|
pub allowed_ips: Vec<IpPattern>,
|
||||||
/// Project ID. This is used for cache invalidation.
|
/// Project ID. This is used for cache invalidation.
|
||||||
pub project_id: Option<ProjectId>,
|
pub project_id: Option<ProjectIdInt>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Info for establishing a connection to a compute node.
|
/// Info for establishing a connection to a compute node.
|
||||||
|
|||||||
@@ -4,10 +4,16 @@ use super::{
|
|||||||
errors::{ApiError, GetAuthInfoError, WakeComputeError},
|
errors::{ApiError, GetAuthInfoError, WakeComputeError},
|
||||||
AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
|
AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
|
||||||
};
|
};
|
||||||
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
|
|
||||||
use crate::context::RequestMonitoring;
|
use crate::context::RequestMonitoring;
|
||||||
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
|
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
|
||||||
use crate::{auth::IpPattern, cache::Cached};
|
use crate::{auth::IpPattern, cache::Cached};
|
||||||
|
use crate::{
|
||||||
|
console::{
|
||||||
|
messages::MetricsAuxInfo,
|
||||||
|
provider::{CachedAllowedIps, CachedRoleSecret},
|
||||||
|
},
|
||||||
|
BranchId, EndpointId, ProjectId,
|
||||||
|
};
|
||||||
use futures::TryFutureExt;
|
use futures::TryFutureExt;
|
||||||
use std::{str::FromStr, sync::Arc};
|
use std::{str::FromStr, sync::Arc};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
@@ -114,7 +120,12 @@ impl Api {
|
|||||||
|
|
||||||
let node = NodeInfo {
|
let node = NodeInfo {
|
||||||
config,
|
config,
|
||||||
aux: Default::default(),
|
aux: MetricsAuxInfo {
|
||||||
|
endpoint_id: (&EndpointId::from("endpoint")).into(),
|
||||||
|
project_id: (&ProjectId::from("project")).into(),
|
||||||
|
branch_id: (&BranchId::from("branch")).into(),
|
||||||
|
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
|
||||||
|
},
|
||||||
allow_self_signed_compute: false,
|
allow_self_signed_compute: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -181,15 +181,16 @@ impl super::Api for Api {
|
|||||||
}
|
}
|
||||||
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
|
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
|
||||||
if let Some(project_id) = auth_info.project_id {
|
if let Some(project_id) = auth_info.project_id {
|
||||||
|
let ep_int = ep.into();
|
||||||
self.caches.project_info.insert_role_secret(
|
self.caches.project_info.insert_role_secret(
|
||||||
&project_id,
|
project_id,
|
||||||
ep,
|
ep_int,
|
||||||
user,
|
user.into(),
|
||||||
auth_info.secret.clone(),
|
auth_info.secret.clone(),
|
||||||
);
|
);
|
||||||
self.caches.project_info.insert_allowed_ips(
|
self.caches.project_info.insert_allowed_ips(
|
||||||
&project_id,
|
project_id,
|
||||||
ep,
|
ep_int,
|
||||||
Arc::new(auth_info.allowed_ips),
|
Arc::new(auth_info.allowed_ips),
|
||||||
);
|
);
|
||||||
ctx.set_project_id(project_id);
|
ctx.set_project_id(project_id);
|
||||||
@@ -217,15 +218,16 @@ impl super::Api for Api {
|
|||||||
let allowed_ips = Arc::new(auth_info.allowed_ips);
|
let allowed_ips = Arc::new(auth_info.allowed_ips);
|
||||||
let user = &user_info.user;
|
let user = &user_info.user;
|
||||||
if let Some(project_id) = auth_info.project_id {
|
if let Some(project_id) = auth_info.project_id {
|
||||||
|
let ep_int = ep.into();
|
||||||
self.caches.project_info.insert_role_secret(
|
self.caches.project_info.insert_role_secret(
|
||||||
&project_id,
|
project_id,
|
||||||
ep,
|
ep_int,
|
||||||
user,
|
user.into(),
|
||||||
auth_info.secret.clone(),
|
auth_info.secret.clone(),
|
||||||
);
|
);
|
||||||
self.caches
|
self.caches
|
||||||
.project_info
|
.project_info
|
||||||
.insert_allowed_ips(&project_id, ep, allowed_ips.clone());
|
.insert_allowed_ips(project_id, ep_int, allowed_ips.clone());
|
||||||
ctx.set_project_id(project_id);
|
ctx.set_project_id(project_id);
|
||||||
}
|
}
|
||||||
Ok((
|
Ok((
|
||||||
@@ -248,8 +250,7 @@ impl super::Api for Api {
|
|||||||
// which means that we might cache it to reduce the load and latency.
|
// which means that we might cache it to reduce the load and latency.
|
||||||
if let Some(cached) = self.caches.node_info.get(&key) {
|
if let Some(cached) = self.caches.node_info.get(&key) {
|
||||||
info!(key = &*key, "found cached compute node info");
|
info!(key = &*key, "found cached compute node info");
|
||||||
info!("cold_start_info=warm");
|
ctx.set_project(cached.aux.clone());
|
||||||
ctx.set_cold_start_info(ColdStartInfo::Warm);
|
|
||||||
return Ok(cached);
|
return Ok(cached);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -260,17 +261,21 @@ impl super::Api for Api {
|
|||||||
if permit.should_check_cache() {
|
if permit.should_check_cache() {
|
||||||
if let Some(cached) = self.caches.node_info.get(&key) {
|
if let Some(cached) = self.caches.node_info.get(&key) {
|
||||||
info!(key = &*key, "found cached compute node info");
|
info!(key = &*key, "found cached compute node info");
|
||||||
info!("cold_start_info=warm");
|
ctx.set_project(cached.aux.clone());
|
||||||
ctx.set_cold_start_info(ColdStartInfo::Warm);
|
|
||||||
return Ok(cached);
|
return Ok(cached);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let node = self.do_wake_compute(ctx, user_info).await?;
|
let mut node = self.do_wake_compute(ctx, user_info).await?;
|
||||||
ctx.set_project(node.aux.clone());
|
ctx.set_project(node.aux.clone());
|
||||||
let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default();
|
let cold_start_info = node.aux.cold_start_info;
|
||||||
info!(?cold_start_info, "woken up a compute node");
|
info!("woken up a compute node");
|
||||||
let (_, cached) = self.caches.node_info.insert(key.clone(), node);
|
|
||||||
|
// store the cached node as 'warm'
|
||||||
|
node.aux.cold_start_info = ColdStartInfo::WarmCached;
|
||||||
|
let (_, mut cached) = self.caches.node_info.insert(key.clone(), node);
|
||||||
|
cached.aux.cold_start_info = cold_start_info;
|
||||||
|
|
||||||
info!(key = &*key, "created a cache entry for compute node info");
|
info!(key = &*key, "created a cache entry for compute node info");
|
||||||
|
|
||||||
Ok(cached)
|
Ok(cached)
|
||||||
|
|||||||
@@ -11,8 +11,9 @@ use uuid::Uuid;
|
|||||||
use crate::{
|
use crate::{
|
||||||
console::messages::{ColdStartInfo, MetricsAuxInfo},
|
console::messages::{ColdStartInfo, MetricsAuxInfo},
|
||||||
error::ErrorKind,
|
error::ErrorKind,
|
||||||
|
intern::{BranchIdInt, ProjectIdInt},
|
||||||
metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
|
metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
|
||||||
BranchId, DbName, EndpointId, ProjectId, RoleName,
|
DbName, EndpointId, RoleName,
|
||||||
};
|
};
|
||||||
|
|
||||||
use self::parquet::RequestData;
|
use self::parquet::RequestData;
|
||||||
@@ -34,8 +35,8 @@ pub struct RequestMonitoring {
|
|||||||
pub span: Span,
|
pub span: Span,
|
||||||
|
|
||||||
// filled in as they are discovered
|
// filled in as they are discovered
|
||||||
project: Option<ProjectId>,
|
project: Option<ProjectIdInt>,
|
||||||
branch: Option<BranchId>,
|
branch: Option<BranchIdInt>,
|
||||||
endpoint_id: Option<EndpointId>,
|
endpoint_id: Option<EndpointId>,
|
||||||
dbname: Option<DbName>,
|
dbname: Option<DbName>,
|
||||||
user: Option<RoleName>,
|
user: Option<RoleName>,
|
||||||
@@ -43,7 +44,7 @@ pub struct RequestMonitoring {
|
|||||||
error_kind: Option<ErrorKind>,
|
error_kind: Option<ErrorKind>,
|
||||||
pub(crate) auth_method: Option<AuthMethod>,
|
pub(crate) auth_method: Option<AuthMethod>,
|
||||||
success: bool,
|
success: bool,
|
||||||
cold_start_info: Option<ColdStartInfo>,
|
pub(crate) cold_start_info: ColdStartInfo,
|
||||||
|
|
||||||
// extra
|
// extra
|
||||||
// This sender is here to keep the request monitoring channel open while requests are taking place.
|
// This sender is here to keep the request monitoring channel open while requests are taking place.
|
||||||
@@ -92,7 +93,7 @@ impl RequestMonitoring {
|
|||||||
error_kind: None,
|
error_kind: None,
|
||||||
auth_method: None,
|
auth_method: None,
|
||||||
success: false,
|
success: false,
|
||||||
cold_start_info: None,
|
cold_start_info: ColdStartInfo::Unknown,
|
||||||
|
|
||||||
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
|
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
|
||||||
latency_timer: LatencyTimer::new(protocol),
|
latency_timer: LatencyTimer::new(protocol),
|
||||||
@@ -113,26 +114,31 @@ impl RequestMonitoring {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
|
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
|
||||||
self.cold_start_info = Some(info);
|
self.cold_start_info = info;
|
||||||
|
self.latency_timer.cold_start_info(info);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_project(&mut self, x: MetricsAuxInfo) {
|
pub fn set_project(&mut self, x: MetricsAuxInfo) {
|
||||||
self.set_endpoint_id(x.endpoint_id);
|
if self.endpoint_id.is_none() {
|
||||||
|
self.set_endpoint_id(x.endpoint_id.as_str().into())
|
||||||
|
}
|
||||||
self.branch = Some(x.branch_id);
|
self.branch = Some(x.branch_id);
|
||||||
self.project = Some(x.project_id);
|
self.project = Some(x.project_id);
|
||||||
self.cold_start_info = x.cold_start_info;
|
self.set_cold_start_info(x.cold_start_info);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_project_id(&mut self, project_id: ProjectId) {
|
pub fn set_project_id(&mut self, project_id: ProjectIdInt) {
|
||||||
self.project = Some(project_id);
|
self.project = Some(project_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
|
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
|
||||||
self.span.record("ep", display(&endpoint_id));
|
if self.endpoint_id.is_none() {
|
||||||
crate::metrics::CONNECTING_ENDPOINTS
|
self.span.record("ep", display(&endpoint_id));
|
||||||
.with_label_values(&[self.protocol])
|
crate::metrics::CONNECTING_ENDPOINTS
|
||||||
.measure(&endpoint_id);
|
.with_label_values(&[self.protocol])
|
||||||
self.endpoint_id = Some(endpoint_id);
|
.measure(&endpoint_id);
|
||||||
|
self.endpoint_id = Some(endpoint_id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_application(&mut self, app: Option<SmolStr>) {
|
pub fn set_application(&mut self, app: Option<SmolStr>) {
|
||||||
|
|||||||
@@ -13,12 +13,14 @@ use parquet::{
|
|||||||
},
|
},
|
||||||
record::RecordWriter,
|
record::RecordWriter,
|
||||||
};
|
};
|
||||||
use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel};
|
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||||
use tokio::{sync::mpsc, time};
|
use tokio::{sync::mpsc, time};
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{debug, info, Span};
|
use tracing::{debug, info, Span};
|
||||||
use utils::backoff;
|
use utils::backoff;
|
||||||
|
|
||||||
|
use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig};
|
||||||
|
|
||||||
use super::{RequestMonitoring, LOG_CHAN};
|
use super::{RequestMonitoring, LOG_CHAN};
|
||||||
|
|
||||||
#[derive(clap::Args, Clone, Debug)]
|
#[derive(clap::Args, Clone, Debug)]
|
||||||
@@ -50,21 +52,13 @@ pub struct ParquetUploadArgs {
|
|||||||
parquet_upload_compression: Compression,
|
parquet_upload_compression: Compression,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
|
|
||||||
/// runtime type errors from the value parser we use.
|
|
||||||
type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
|
|
||||||
|
|
||||||
fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
|
|
||||||
RemoteStorageConfig::from_toml(&s.parse()?)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Occasional network issues and such can cause remote operations to fail, and
|
// Occasional network issues and such can cause remote operations to fail, and
|
||||||
// that's expected. If a upload fails, we log it at info-level, and retry.
|
// that's expected. If a upload fails, we log it at info-level, and retry.
|
||||||
// But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN
|
// But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN
|
||||||
// level instead, as repeated failures can mean a more serious problem. If it
|
// level instead, as repeated failures can mean a more serious problem. If it
|
||||||
// fails more than FAILED_UPLOAD_RETRIES times, we give up
|
// fails more than FAILED_UPLOAD_RETRIES times, we give up
|
||||||
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||||
pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
|
pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
|
||||||
|
|
||||||
// the parquet crate leaves a lot to be desired...
|
// the parquet crate leaves a lot to be desired...
|
||||||
// what follows is an attempt to write parquet files with minimal allocs.
|
// what follows is an attempt to write parquet files with minimal allocs.
|
||||||
@@ -93,7 +87,7 @@ pub struct RequestData {
|
|||||||
/// Or if we make it to proxy_pass
|
/// Or if we make it to proxy_pass
|
||||||
success: bool,
|
success: bool,
|
||||||
/// Indicates if the cplane started the new compute node for this request.
|
/// Indicates if the cplane started the new compute node for this request.
|
||||||
cold_start_info: Option<&'static str>,
|
cold_start_info: &'static str,
|
||||||
/// Tracks time from session start (HTTP request/libpq TCP handshake)
|
/// Tracks time from session start (HTTP request/libpq TCP handshake)
|
||||||
/// Through to success/failure
|
/// Through to success/failure
|
||||||
duration_us: u64,
|
duration_us: u64,
|
||||||
@@ -121,12 +115,7 @@ impl From<&RequestMonitoring> for RequestData {
|
|||||||
region: value.region,
|
region: value.region,
|
||||||
error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
|
error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
|
||||||
success: value.success,
|
success: value.success,
|
||||||
cold_start_info: value.cold_start_info.as_ref().map(|x| match x {
|
cold_start_info: value.cold_start_info.as_str(),
|
||||||
crate::console::messages::ColdStartInfo::Unknown => "unknown",
|
|
||||||
crate::console::messages::ColdStartInfo::Warm => "warm",
|
|
||||||
crate::console::messages::ColdStartInfo::PoolHit => "pool_hit",
|
|
||||||
crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss",
|
|
||||||
}),
|
|
||||||
duration_us: SystemTime::from(value.first_packet)
|
duration_us: SystemTime::from(value.first_packet)
|
||||||
.elapsed()
|
.elapsed()
|
||||||
.unwrap_or_default()
|
.unwrap_or_default()
|
||||||
@@ -460,7 +449,7 @@ mod tests {
|
|||||||
region: "us-east-1",
|
region: "us-east-1",
|
||||||
error: None,
|
error: None,
|
||||||
success: rng.gen(),
|
success: rng.gen(),
|
||||||
cold_start_info: Some("no"),
|
cold_start_info: "no",
|
||||||
duration_us: rng.gen_range(0..30_000_000),
|
duration_us: rng.gen_range(0..30_000_000),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -530,15 +519,15 @@ mod tests {
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
file_stats,
|
file_stats,
|
||||||
[
|
[
|
||||||
(1314406, 3, 6000),
|
(1314385, 3, 6000),
|
||||||
(1314399, 3, 6000),
|
(1314378, 3, 6000),
|
||||||
(1314459, 3, 6000),
|
(1314438, 3, 6000),
|
||||||
(1314416, 3, 6000),
|
(1314395, 3, 6000),
|
||||||
(1314546, 3, 6000),
|
(1314525, 3, 6000),
|
||||||
(1314388, 3, 6000),
|
(1314367, 3, 6000),
|
||||||
(1314180, 3, 6000),
|
(1314159, 3, 6000),
|
||||||
(1314416, 3, 6000),
|
(1314395, 3, 6000),
|
||||||
(438359, 1, 2000)
|
(438352, 1, 2000)
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -568,11 +557,11 @@ mod tests {
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
file_stats,
|
file_stats,
|
||||||
[
|
[
|
||||||
(1220668, 5, 10000),
|
(1220633, 5, 10000),
|
||||||
(1226818, 5, 10000),
|
(1226783, 5, 10000),
|
||||||
(1228612, 5, 10000),
|
(1228577, 5, 10000),
|
||||||
(1227974, 5, 10000),
|
(1227939, 5, 10000),
|
||||||
(1219252, 5, 10000)
|
(1219217, 5, 10000)
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -604,11 +593,11 @@ mod tests {
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
file_stats,
|
file_stats,
|
||||||
[
|
[
|
||||||
(1206315, 5, 10000),
|
(1206280, 5, 10000),
|
||||||
(1206046, 5, 10000),
|
(1206011, 5, 10000),
|
||||||
(1206339, 5, 10000),
|
(1206304, 5, 10000),
|
||||||
(1206327, 5, 10000),
|
(1206292, 5, 10000),
|
||||||
(1206582, 5, 10000)
|
(1206547, 5, 10000)
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -633,15 +622,15 @@ mod tests {
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
file_stats,
|
file_stats,
|
||||||
[
|
[
|
||||||
(1314406, 3, 6000),
|
(1314385, 3, 6000),
|
||||||
(1314399, 3, 6000),
|
(1314378, 3, 6000),
|
||||||
(1314459, 3, 6000),
|
(1314438, 3, 6000),
|
||||||
(1314416, 3, 6000),
|
(1314395, 3, 6000),
|
||||||
(1314546, 3, 6000),
|
(1314525, 3, 6000),
|
||||||
(1314388, 3, 6000),
|
(1314367, 3, 6000),
|
||||||
(1314180, 3, 6000),
|
(1314159, 3, 6000),
|
||||||
(1314416, 3, 6000),
|
(1314395, 3, 6000),
|
||||||
(438359, 1, 2000)
|
(438352, 1, 2000)
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -678,7 +667,7 @@ mod tests {
|
|||||||
// files are smaller than the size threshold, but they took too long to fill so were flushed early
|
// files are smaller than the size threshold, but they took too long to fill so were flushed early
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
file_stats,
|
file_stats,
|
||||||
[(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)]
|
[(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)]
|
||||||
);
|
);
|
||||||
|
|
||||||
tmpdir.close().unwrap();
|
tmpdir.close().unwrap();
|
||||||
|
|||||||
@@ -12,6 +12,8 @@ use metrics::{
|
|||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use tokio::time::{self, Instant};
|
use tokio::time::{self, Instant};
|
||||||
|
|
||||||
|
use crate::console::messages::ColdStartInfo;
|
||||||
|
|
||||||
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||||
register_int_counter_pair_vec!(
|
register_int_counter_pair_vec!(
|
||||||
"proxy_opened_db_connections_total",
|
"proxy_opened_db_connections_total",
|
||||||
@@ -50,8 +52,8 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
|
|||||||
"proxy_compute_connection_latency_seconds",
|
"proxy_compute_connection_latency_seconds",
|
||||||
"Time it took for proxy to establish a connection to the compute endpoint",
|
"Time it took for proxy to establish a connection to the compute endpoint",
|
||||||
// http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
|
// http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
|
||||||
// 3 * 2 * 2 * 2 * 2 = 48 counters
|
// 3 * 6 * 2 * 2 = 72 counters
|
||||||
&["protocol", "cache_miss", "pool_miss", "outcome", "excluded"],
|
&["protocol", "cold_start_info", "outcome", "excluded"],
|
||||||
// largest bucket = 2^16 * 0.5ms = 32s
|
// largest bucket = 2^16 * 0.5ms = 32s
|
||||||
exponential_buckets(0.0005, 2.0, 16).unwrap(),
|
exponential_buckets(0.0005, 2.0, 16).unwrap(),
|
||||||
)
|
)
|
||||||
@@ -117,12 +119,15 @@ pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
});
|
});
|
||||||
|
|
||||||
pub static HTTP_CONTENT_LENGTH: Lazy<Histogram> = Lazy::new(|| {
|
pub static HTTP_CONTENT_LENGTH: Lazy<HistogramVec> = Lazy::new(|| {
|
||||||
register_histogram!(
|
register_histogram_vec!(
|
||||||
"proxy_http_conn_content_length_bytes",
|
"proxy_http_conn_content_length_bytes",
|
||||||
"Time it took for proxy to establish a connection to the compute endpoint",
|
"Number of bytes the HTTP response content consumes",
|
||||||
// largest bucket = 3^16 * 0.05ms = 2.15s
|
// request/response
|
||||||
exponential_buckets(8.0, 2.0, 20).unwrap()
|
&["direction"],
|
||||||
|
// smallest bucket = 16 bytes
|
||||||
|
// largest bucket = 4^12 * 16 bytes = 256MB
|
||||||
|
exponential_buckets(16.0, 4.0, 12).unwrap()
|
||||||
)
|
)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
});
|
});
|
||||||
@@ -180,6 +185,20 @@ struct Accumulated {
|
|||||||
compute: time::Duration,
|
compute: time::Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enum Outcome {
|
||||||
|
Success,
|
||||||
|
Failed,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Outcome {
|
||||||
|
fn as_str(&self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
Outcome::Success => "success",
|
||||||
|
Outcome::Failed => "failed",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct LatencyTimer {
|
pub struct LatencyTimer {
|
||||||
// time since the stopwatch was started
|
// time since the stopwatch was started
|
||||||
start: time::Instant,
|
start: time::Instant,
|
||||||
@@ -189,9 +208,8 @@ pub struct LatencyTimer {
|
|||||||
accumulated: Accumulated,
|
accumulated: Accumulated,
|
||||||
// label data
|
// label data
|
||||||
protocol: &'static str,
|
protocol: &'static str,
|
||||||
cache_miss: bool,
|
cold_start_info: ColdStartInfo,
|
||||||
pool_miss: bool,
|
outcome: Outcome,
|
||||||
outcome: &'static str,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct LatencyTimerPause<'a> {
|
pub struct LatencyTimerPause<'a> {
|
||||||
@@ -207,11 +225,9 @@ impl LatencyTimer {
|
|||||||
stop: None,
|
stop: None,
|
||||||
accumulated: Accumulated::default(),
|
accumulated: Accumulated::default(),
|
||||||
protocol,
|
protocol,
|
||||||
cache_miss: false,
|
cold_start_info: ColdStartInfo::Unknown,
|
||||||
// by default we don't do pooling
|
|
||||||
pool_miss: true,
|
|
||||||
// assume failed unless otherwise specified
|
// assume failed unless otherwise specified
|
||||||
outcome: "failed",
|
outcome: Outcome::Failed,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -223,12 +239,8 @@ impl LatencyTimer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn cache_miss(&mut self) {
|
pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) {
|
||||||
self.cache_miss = true;
|
self.cold_start_info = cold_start_info;
|
||||||
}
|
|
||||||
|
|
||||||
pub fn pool_hit(&mut self) {
|
|
||||||
self.pool_miss = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn success(&mut self) {
|
pub fn success(&mut self) {
|
||||||
@@ -236,7 +248,7 @@ impl LatencyTimer {
|
|||||||
self.stop = Some(time::Instant::now());
|
self.stop = Some(time::Instant::now());
|
||||||
|
|
||||||
// success
|
// success
|
||||||
self.outcome = "success";
|
self.outcome = Outcome::Success;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -261,9 +273,8 @@ impl Drop for LatencyTimer {
|
|||||||
COMPUTE_CONNECTION_LATENCY
|
COMPUTE_CONNECTION_LATENCY
|
||||||
.with_label_values(&[
|
.with_label_values(&[
|
||||||
self.protocol,
|
self.protocol,
|
||||||
bool_to_str(self.cache_miss),
|
self.cold_start_info.as_str(),
|
||||||
bool_to_str(self.pool_miss),
|
self.outcome.as_str(),
|
||||||
self.outcome,
|
|
||||||
"client",
|
"client",
|
||||||
])
|
])
|
||||||
.observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
|
.observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
|
||||||
@@ -272,9 +283,8 @@ impl Drop for LatencyTimer {
|
|||||||
COMPUTE_CONNECTION_LATENCY
|
COMPUTE_CONNECTION_LATENCY
|
||||||
.with_label_values(&[
|
.with_label_values(&[
|
||||||
self.protocol,
|
self.protocol,
|
||||||
bool_to_str(self.cache_miss),
|
self.cold_start_info.as_str(),
|
||||||
bool_to_str(self.pool_miss),
|
self.outcome.as_str(),
|
||||||
self.outcome,
|
|
||||||
"client_and_cplane",
|
"client_and_cplane",
|
||||||
])
|
])
|
||||||
.observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
|
.observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
|
||||||
|
|||||||
@@ -87,7 +87,6 @@ impl ConnectMechanism for TcpMechanism<'_> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Try to connect to the compute node, retrying if necessary.
|
/// Try to connect to the compute node, retrying if necessary.
|
||||||
/// This function might update `node_info`, so we take it by `&mut`.
|
|
||||||
#[tracing::instrument(skip_all)]
|
#[tracing::instrument(skip_all)]
|
||||||
pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
|
pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
|
||||||
ctx: &mut RequestMonitoring,
|
ctx: &mut RequestMonitoring,
|
||||||
@@ -132,7 +131,6 @@ where
|
|||||||
} else {
|
} else {
|
||||||
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
|
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
|
||||||
info!("compute node's state has likely changed; requesting a wake-up");
|
info!("compute node's state has likely changed; requesting a wake-up");
|
||||||
ctx.latency_timer.cache_miss();
|
|
||||||
let old_node_info = invalidate_cache(node_info);
|
let old_node_info = invalidate_cache(node_info);
|
||||||
let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
|
let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
|
||||||
node_info.reuse_settings(old_node_info);
|
node_info.reuse_settings(old_node_info);
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use crate::{
|
|||||||
console::messages::MetricsAuxInfo,
|
console::messages::MetricsAuxInfo,
|
||||||
metrics::NUM_BYTES_PROXIED_COUNTER,
|
metrics::NUM_BYTES_PROXIED_COUNTER,
|
||||||
stream::Stream,
|
stream::Stream,
|
||||||
usage_metrics::{Ids, USAGE_METRICS},
|
usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS},
|
||||||
};
|
};
|
||||||
use metrics::IntCounterPairGuard;
|
use metrics::IntCounterPairGuard;
|
||||||
use tokio::io::{AsyncRead, AsyncWrite};
|
use tokio::io::{AsyncRead, AsyncWrite};
|
||||||
@@ -19,8 +19,8 @@ pub async fn proxy_pass(
|
|||||||
aux: MetricsAuxInfo,
|
aux: MetricsAuxInfo,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let usage = USAGE_METRICS.register(Ids {
|
let usage = USAGE_METRICS.register(Ids {
|
||||||
endpoint_id: aux.endpoint_id.clone(),
|
endpoint_id: aux.endpoint_id,
|
||||||
branch_id: aux.branch_id.clone(),
|
branch_id: aux.branch_id,
|
||||||
});
|
});
|
||||||
|
|
||||||
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
|
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
|
||||||
|
|||||||
@@ -12,11 +12,12 @@ use crate::auth::backend::{
|
|||||||
};
|
};
|
||||||
use crate::config::CertResolver;
|
use crate::config::CertResolver;
|
||||||
use crate::console::caches::NodeInfoCache;
|
use crate::console::caches::NodeInfoCache;
|
||||||
|
use crate::console::messages::MetricsAuxInfo;
|
||||||
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
|
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
|
||||||
use crate::console::{self, CachedNodeInfo, NodeInfo};
|
use crate::console::{self, CachedNodeInfo, NodeInfo};
|
||||||
use crate::error::ErrorKind;
|
use crate::error::ErrorKind;
|
||||||
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
|
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
|
||||||
use crate::{http, sasl, scram};
|
use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use rstest::rstest;
|
use rstest::rstest;
|
||||||
@@ -512,7 +513,12 @@ impl TestBackend for TestConnectMechanism {
|
|||||||
fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
|
fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
|
||||||
let node = NodeInfo {
|
let node = NodeInfo {
|
||||||
config: compute::ConnCfg::new(),
|
config: compute::ConnCfg::new(),
|
||||||
aux: Default::default(),
|
aux: MetricsAuxInfo {
|
||||||
|
endpoint_id: (&EndpointId::from("endpoint")).into(),
|
||||||
|
project_id: (&ProjectId::from("project")).into(),
|
||||||
|
branch_id: (&BranchId::from("branch")).into(),
|
||||||
|
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
|
||||||
|
},
|
||||||
allow_self_signed_compute: false,
|
allow_self_signed_compute: false,
|
||||||
};
|
};
|
||||||
let (_, node) = cache.insert("key".into(), node);
|
let (_, node) = cache.insert("key".into(), node);
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ use crate::{
|
|||||||
config::ProxyConfig,
|
config::ProxyConfig,
|
||||||
console::{
|
console::{
|
||||||
errors::{GetAuthInfoError, WakeComputeError},
|
errors::{GetAuthInfoError, WakeComputeError},
|
||||||
messages::ColdStartInfo,
|
|
||||||
CachedNodeInfo,
|
CachedNodeInfo,
|
||||||
},
|
},
|
||||||
context::RequestMonitoring,
|
context::RequestMonitoring,
|
||||||
@@ -57,7 +56,10 @@ impl PoolingBackend {
|
|||||||
let auth_outcome =
|
let auth_outcome =
|
||||||
crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
|
crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
|
||||||
let res = match auth_outcome {
|
let res = match auth_outcome {
|
||||||
crate::sasl::Outcome::Success(key) => Ok(key),
|
crate::sasl::Outcome::Success(key) => {
|
||||||
|
info!("user successfully authenticated");
|
||||||
|
Ok(key)
|
||||||
|
}
|
||||||
crate::sasl::Outcome::Failure(reason) => {
|
crate::sasl::Outcome::Failure(reason) => {
|
||||||
info!("auth backend failed with an error: {reason}");
|
info!("auth backend failed with an error: {reason}");
|
||||||
Err(AuthError::auth_failed(&*conn_info.user_info.user))
|
Err(AuthError::auth_failed(&*conn_info.user_info.user))
|
||||||
@@ -89,8 +91,6 @@ impl PoolingBackend {
|
|||||||
};
|
};
|
||||||
|
|
||||||
if let Some(client) = maybe_client {
|
if let Some(client) = maybe_client {
|
||||||
info!("cold_start_info=warm");
|
|
||||||
ctx.set_cold_start_info(ColdStartInfo::Warm);
|
|
||||||
return Ok(client);
|
return Ok(client);
|
||||||
}
|
}
|
||||||
let conn_id = uuid::Uuid::new_v4();
|
let conn_id = uuid::Uuid::new_v4();
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ use tokio::time::Instant;
|
|||||||
use tokio_postgres::tls::NoTlsStream;
|
use tokio_postgres::tls::NoTlsStream;
|
||||||
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
|
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
|
||||||
|
|
||||||
use crate::console::messages::MetricsAuxInfo;
|
use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
|
||||||
use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL};
|
use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL};
|
||||||
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
|
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
|
||||||
use crate::{
|
use crate::{
|
||||||
@@ -383,9 +383,12 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
|
|||||||
"pid",
|
"pid",
|
||||||
&tracing::field::display(client.inner.get_process_id()),
|
&tracing::field::display(client.inner.get_process_id()),
|
||||||
);
|
);
|
||||||
info!("pool: reusing connection '{conn_info}'");
|
info!(
|
||||||
|
cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
|
||||||
|
"pool: reusing connection '{conn_info}'"
|
||||||
|
);
|
||||||
client.session.send(ctx.session_id)?;
|
client.session.send(ctx.session_id)?;
|
||||||
ctx.latency_timer.pool_hit();
|
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
|
||||||
ctx.latency_timer.success();
|
ctx.latency_timer.success();
|
||||||
return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
|
return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
|
||||||
}
|
}
|
||||||
@@ -454,8 +457,9 @@ pub fn poll_client<C: ClientInnerExt>(
|
|||||||
let (tx, mut rx) = tokio::sync::watch::channel(session_id);
|
let (tx, mut rx) = tokio::sync::watch::channel(session_id);
|
||||||
|
|
||||||
let span = info_span!(parent: None, "connection", %conn_id);
|
let span = info_span!(parent: None, "connection", %conn_id);
|
||||||
|
let cold_start_info = ctx.cold_start_info;
|
||||||
span.in_scope(|| {
|
span.in_scope(|| {
|
||||||
info!(%conn_info, %session_id, "new connection");
|
info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
|
||||||
});
|
});
|
||||||
let pool = match conn_info.endpoint_cache_key() {
|
let pool = match conn_info.endpoint_cache_key() {
|
||||||
Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)),
|
Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)),
|
||||||
@@ -565,8 +569,8 @@ impl<C: ClientInnerExt> Client<C> {
|
|||||||
pub fn metrics(&self) -> Arc<MetricCounter> {
|
pub fn metrics(&self) -> Arc<MetricCounter> {
|
||||||
let aux = &self.inner.as_ref().unwrap().aux;
|
let aux = &self.inner.as_ref().unwrap().aux;
|
||||||
USAGE_METRICS.register(Ids {
|
USAGE_METRICS.register(Ids {
|
||||||
endpoint_id: aux.endpoint_id.clone(),
|
endpoint_id: aux.endpoint_id,
|
||||||
branch_id: aux.branch_id.clone(),
|
branch_id: aux.branch_id,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -666,6 +670,8 @@ impl<C: ClientInnerExt> Drop for Client<C> {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use std::{mem, sync::atomic::AtomicBool};
|
use std::{mem, sync::atomic::AtomicBool};
|
||||||
|
|
||||||
|
use crate::{BranchId, EndpointId, ProjectId};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
struct MockClient(Arc<AtomicBool>);
|
struct MockClient(Arc<AtomicBool>);
|
||||||
@@ -691,7 +697,12 @@ mod tests {
|
|||||||
ClientInner {
|
ClientInner {
|
||||||
inner: client,
|
inner: client,
|
||||||
session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
|
session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
|
||||||
aux: Default::default(),
|
aux: MetricsAuxInfo {
|
||||||
|
endpoint_id: (&EndpointId::from("endpoint")).into(),
|
||||||
|
project_id: (&ProjectId::from("project")).into(),
|
||||||
|
branch_id: (&BranchId::from("branch")).into(),
|
||||||
|
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
|
||||||
|
},
|
||||||
conn_id: uuid::Uuid::new_v4(),
|
conn_id: uuid::Uuid::new_v4(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -42,12 +42,15 @@ use crate::error::ReportableError;
|
|||||||
use crate::error::UserFacingError;
|
use crate::error::UserFacingError;
|
||||||
use crate::metrics::HTTP_CONTENT_LENGTH;
|
use crate::metrics::HTTP_CONTENT_LENGTH;
|
||||||
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
|
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
|
||||||
|
use crate::proxy::run_until_cancelled;
|
||||||
use crate::proxy::NeonOptions;
|
use crate::proxy::NeonOptions;
|
||||||
use crate::serverless::backend::HttpConnError;
|
use crate::serverless::backend::HttpConnError;
|
||||||
|
use crate::usage_metrics::MetricCounterRecorder;
|
||||||
use crate::DbName;
|
use crate::DbName;
|
||||||
use crate::RoleName;
|
use crate::RoleName;
|
||||||
|
|
||||||
use super::backend::PoolingBackend;
|
use super::backend::PoolingBackend;
|
||||||
|
use super::conn_pool::Client;
|
||||||
use super::conn_pool::ConnInfo;
|
use super::conn_pool::ConnInfo;
|
||||||
use super::json::json_to_pg_text;
|
use super::json::json_to_pg_text;
|
||||||
use super::json::pg_text_row_to_json;
|
use super::json::pg_text_row_to_json;
|
||||||
@@ -219,14 +222,7 @@ pub async fn handle(
|
|||||||
backend: Arc<PoolingBackend>,
|
backend: Arc<PoolingBackend>,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let cancel2 = cancel.clone();
|
|
||||||
let handle = tokio::spawn(async move {
|
|
||||||
time::sleep(config.http_config.request_timeout).await;
|
|
||||||
cancel2.cancel();
|
|
||||||
});
|
|
||||||
|
|
||||||
let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
|
let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
|
||||||
handle.abort();
|
|
||||||
|
|
||||||
let mut response = match result {
|
let mut response = match result {
|
||||||
Ok(r) => {
|
Ok(r) => {
|
||||||
@@ -237,10 +233,7 @@ pub async fn handle(
|
|||||||
let error_kind = e.get_error_kind();
|
let error_kind = e.get_error_kind();
|
||||||
ctx.set_error_kind(error_kind);
|
ctx.set_error_kind(error_kind);
|
||||||
|
|
||||||
let message = format!(
|
let message = "Query cancelled, connection was terminated";
|
||||||
"Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections",
|
|
||||||
config.http_config.request_timeout.as_secs_f64()
|
|
||||||
);
|
|
||||||
|
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
kind=error_kind.to_metric_label(),
|
kind=error_kind.to_metric_label(),
|
||||||
@@ -434,6 +427,63 @@ impl ReportableError for SqlOverHttpCancel {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, Debug)]
|
||||||
|
struct HttpHeaders {
|
||||||
|
raw_output: bool,
|
||||||
|
default_array_mode: bool,
|
||||||
|
txn_isolation_level: Option<IsolationLevel>,
|
||||||
|
txn_read_only: bool,
|
||||||
|
txn_deferrable: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HttpHeaders {
|
||||||
|
fn try_parse(headers: &hyper::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
|
||||||
|
// Determine the output options. Default behaviour is 'false'. Anything that is not
|
||||||
|
// strictly 'true' assumed to be false.
|
||||||
|
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
|
||||||
|
let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
|
||||||
|
|
||||||
|
// isolation level, read only and deferrable
|
||||||
|
let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) {
|
||||||
|
Some(x) => Some(
|
||||||
|
map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?,
|
||||||
|
),
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
|
||||||
|
let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
raw_output,
|
||||||
|
default_array_mode,
|
||||||
|
txn_isolation_level,
|
||||||
|
txn_read_only,
|
||||||
|
txn_deferrable,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn map_header_to_isolation_level(level: &HeaderValue) -> Option<IsolationLevel> {
|
||||||
|
match level.as_bytes() {
|
||||||
|
b"Serializable" => Some(IsolationLevel::Serializable),
|
||||||
|
b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted),
|
||||||
|
b"ReadCommitted" => Some(IsolationLevel::ReadCommitted),
|
||||||
|
b"RepeatableRead" => Some(IsolationLevel::RepeatableRead),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn map_isolation_level_to_headers(level: IsolationLevel) -> Option<HeaderValue> {
|
||||||
|
match level {
|
||||||
|
IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")),
|
||||||
|
IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")),
|
||||||
|
IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")),
|
||||||
|
IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn handle_inner(
|
async fn handle_inner(
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
config: &'static ProxyConfig,
|
config: &'static ProxyConfig,
|
||||||
@@ -450,43 +500,26 @@ async fn handle_inner(
|
|||||||
// Determine the destination and connection params
|
// Determine the destination and connection params
|
||||||
//
|
//
|
||||||
let headers = request.headers();
|
let headers = request.headers();
|
||||||
|
|
||||||
// TLS config should be there.
|
// TLS config should be there.
|
||||||
let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
|
let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
|
||||||
info!(user = conn_info.user_info.user.as_str(), "credentials");
|
info!(user = conn_info.user_info.user.as_str(), "credentials");
|
||||||
|
|
||||||
// Determine the output options. Default behaviour is 'false'. Anything that is not
|
|
||||||
// strictly 'true' assumed to be false.
|
|
||||||
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
|
|
||||||
let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
|
|
||||||
|
|
||||||
// Allow connection pooling only if explicitly requested
|
// Allow connection pooling only if explicitly requested
|
||||||
// or if we have decided that http pool is no longer opt-in
|
// or if we have decided that http pool is no longer opt-in
|
||||||
let allow_pool = !config.http_config.pool_options.opt_in
|
let allow_pool = !config.http_config.pool_options.opt_in
|
||||||
|| headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
|
|| headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
|
||||||
|
|
||||||
// isolation level, read only and deferrable
|
let parsed_headers = HttpHeaders::try_parse(headers)?;
|
||||||
|
|
||||||
let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
|
|
||||||
let txn_isolation_level = match txn_isolation_level_raw {
|
|
||||||
Some(ref x) => Some(match x.as_bytes() {
|
|
||||||
b"Serializable" => IsolationLevel::Serializable,
|
|
||||||
b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
|
|
||||||
b"ReadCommitted" => IsolationLevel::ReadCommitted,
|
|
||||||
b"RepeatableRead" => IsolationLevel::RepeatableRead,
|
|
||||||
_ => return Err(SqlOverHttpError::InvalidIsolationLevel),
|
|
||||||
}),
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
|
|
||||||
let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
|
|
||||||
|
|
||||||
let request_content_length = match request.body().size_hint().upper() {
|
let request_content_length = match request.body().size_hint().upper() {
|
||||||
Some(v) => v,
|
Some(v) => v,
|
||||||
None => MAX_REQUEST_SIZE + 1,
|
None => MAX_REQUEST_SIZE + 1,
|
||||||
};
|
};
|
||||||
info!(request_content_length, "request size in bytes");
|
info!(request_content_length, "request size in bytes");
|
||||||
HTTP_CONTENT_LENGTH.observe(request_content_length as f64);
|
HTTP_CONTENT_LENGTH
|
||||||
|
.with_label_values(&["request"])
|
||||||
|
.observe(request_content_length as f64);
|
||||||
|
|
||||||
// we don't have a streaming request support yet so this is to prevent OOM
|
// we don't have a streaming request support yet so this is to prevent OOM
|
||||||
// from a malicious user sending an extremely large request body
|
// from a malicious user sending an extremely large request body
|
||||||
@@ -514,20 +547,18 @@ async fn handle_inner(
|
|||||||
}
|
}
|
||||||
.map_err(SqlOverHttpError::from);
|
.map_err(SqlOverHttpError::from);
|
||||||
|
|
||||||
// Run both operations in parallel
|
let (payload, mut client) = match run_until_cancelled(
|
||||||
let (payload, mut client) = match select(
|
// Run both operations in parallel
|
||||||
try_join(
|
try_join(
|
||||||
pin!(fetch_and_process_request),
|
pin!(fetch_and_process_request),
|
||||||
pin!(authenticate_and_connect),
|
pin!(authenticate_and_connect),
|
||||||
),
|
),
|
||||||
pin!(cancel.cancelled()),
|
&cancel,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Either::Left((result, _cancelled)) => result?,
|
Some(result) => result?,
|
||||||
Either::Right((_cancelled, _)) => {
|
None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)),
|
||||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect))
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut response = Response::builder()
|
let mut response = Response::builder()
|
||||||
@@ -537,95 +568,143 @@ async fn handle_inner(
|
|||||||
//
|
//
|
||||||
// Now execute the query and return the result
|
// Now execute the query and return the result
|
||||||
//
|
//
|
||||||
let mut size = 0;
|
|
||||||
let result = match payload {
|
let result = match payload {
|
||||||
Payload::Single(stmt) => {
|
Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
|
||||||
let mut size = 0;
|
|
||||||
let (inner, mut discard) = client.inner();
|
|
||||||
let cancel_token = inner.cancel_token();
|
|
||||||
let query = pin!(query_to_json(
|
|
||||||
&*inner,
|
|
||||||
stmt,
|
|
||||||
&mut size,
|
|
||||||
raw_output,
|
|
||||||
default_array_mode
|
|
||||||
));
|
|
||||||
let cancelled = pin!(cancel.cancelled());
|
|
||||||
let res = select(query, cancelled).await;
|
|
||||||
match res {
|
|
||||||
Either::Left((Ok((status, results)), _cancelled)) => {
|
|
||||||
discard.check_idle(status);
|
|
||||||
results
|
|
||||||
}
|
|
||||||
Either::Left((Err(e), _cancelled)) => {
|
|
||||||
discard.discard();
|
|
||||||
return Err(e);
|
|
||||||
}
|
|
||||||
Either::Right((_cancelled, query)) => {
|
|
||||||
if let Err(err) = cancel_token.cancel_query(NoTls).await {
|
|
||||||
tracing::error!(?err, "could not cancel query");
|
|
||||||
}
|
|
||||||
match time::timeout(time::Duration::from_millis(100), query).await {
|
|
||||||
Ok(Ok((status, results))) => {
|
|
||||||
discard.check_idle(status);
|
|
||||||
results
|
|
||||||
}
|
|
||||||
Ok(Err(error)) => {
|
|
||||||
let db_error = match &error {
|
|
||||||
SqlOverHttpError::ConnectCompute(
|
|
||||||
HttpConnError::ConnectionError(e),
|
|
||||||
)
|
|
||||||
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
|
||||||
_ => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
// if errored for some other reason, it might not be safe to return
|
|
||||||
if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
|
|
||||||
discard.discard();
|
|
||||||
}
|
|
||||||
|
|
||||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
|
|
||||||
}
|
|
||||||
Err(_timeout) => {
|
|
||||||
discard.discard();
|
|
||||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Payload::Batch(statements) => {
|
Payload::Batch(statements) => {
|
||||||
info!("starting transaction");
|
if parsed_headers.txn_read_only {
|
||||||
let (inner, mut discard) = client.inner();
|
response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
|
||||||
let cancel_token = inner.cancel_token();
|
|
||||||
let mut builder = inner.build_transaction();
|
|
||||||
if let Some(isolation_level) = txn_isolation_level {
|
|
||||||
builder = builder.isolation_level(isolation_level);
|
|
||||||
}
|
}
|
||||||
if txn_read_only {
|
if parsed_headers.txn_deferrable {
|
||||||
builder = builder.read_only(true);
|
response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
|
||||||
}
|
}
|
||||||
if txn_deferrable {
|
if let Some(txn_isolation_level) = parsed_headers
|
||||||
builder = builder.deferrable(true);
|
.txn_isolation_level
|
||||||
}
|
.and_then(map_isolation_level_to_headers)
|
||||||
|
|
||||||
let transaction = builder.start().await.map_err(|e| {
|
|
||||||
// if we cannot start a transaction, we should return immediately
|
|
||||||
// and not return to the pool. connection is clearly broken
|
|
||||||
discard.discard();
|
|
||||||
e
|
|
||||||
})?;
|
|
||||||
|
|
||||||
let results = match query_batch(
|
|
||||||
cancel.child_token(),
|
|
||||||
&transaction,
|
|
||||||
statements,
|
|
||||||
&mut size,
|
|
||||||
raw_output,
|
|
||||||
default_array_mode,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
{
|
{
|
||||||
|
response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
|
||||||
|
}
|
||||||
|
|
||||||
|
statements
|
||||||
|
.process(cancel, &mut client, parsed_headers)
|
||||||
|
.await?
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let metrics = client.metrics();
|
||||||
|
|
||||||
|
// how could this possibly fail
|
||||||
|
let body = serde_json::to_string(&result).expect("json serialization should not fail");
|
||||||
|
let len = body.len();
|
||||||
|
let response = response
|
||||||
|
.body(Body::from(body))
|
||||||
|
// only fails if invalid status code or invalid header/values are given.
|
||||||
|
// these are not user configurable so it cannot fail dynamically
|
||||||
|
.expect("building response payload should not fail");
|
||||||
|
|
||||||
|
// count the egress bytes - we miss the TLS and header overhead but oh well...
|
||||||
|
// moving this later in the stack is going to be a lot of effort and ehhhh
|
||||||
|
metrics.record_egress(len as u64);
|
||||||
|
HTTP_CONTENT_LENGTH
|
||||||
|
.with_label_values(&["response"])
|
||||||
|
.observe(len as f64);
|
||||||
|
|
||||||
|
Ok(response)
|
||||||
|
}
|
||||||
|
|
||||||
|
impl QueryData {
|
||||||
|
async fn process(
|
||||||
|
self,
|
||||||
|
cancel: CancellationToken,
|
||||||
|
client: &mut Client<tokio_postgres::Client>,
|
||||||
|
parsed_headers: HttpHeaders,
|
||||||
|
) -> Result<Value, SqlOverHttpError> {
|
||||||
|
let (inner, mut discard) = client.inner();
|
||||||
|
let cancel_token = inner.cancel_token();
|
||||||
|
|
||||||
|
let res = match select(
|
||||||
|
pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
|
||||||
|
pin!(cancel.cancelled()),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
// The query successfully completed.
|
||||||
|
Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
|
||||||
|
discard.check_idle(status);
|
||||||
|
Ok(results)
|
||||||
|
}
|
||||||
|
// The query failed with an error
|
||||||
|
Either::Left((Err(e), __not_yet_cancelled)) => {
|
||||||
|
discard.discard();
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
// The query was cancelled.
|
||||||
|
Either::Right((_cancelled, query)) => {
|
||||||
|
if let Err(err) = cancel_token.cancel_query(NoTls).await {
|
||||||
|
tracing::error!(?err, "could not cancel query");
|
||||||
|
}
|
||||||
|
// wait for the query cancellation
|
||||||
|
match time::timeout(time::Duration::from_millis(100), query).await {
|
||||||
|
// query successed before it was cancelled.
|
||||||
|
Ok(Ok((status, results))) => {
|
||||||
|
discard.check_idle(status);
|
||||||
|
Ok(results)
|
||||||
|
}
|
||||||
|
// query failed or was cancelled.
|
||||||
|
Ok(Err(error)) => {
|
||||||
|
let db_error = match &error {
|
||||||
|
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
|
||||||
|
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
||||||
|
_ => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
// if errored for some other reason, it might not be safe to return
|
||||||
|
if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
|
||||||
|
discard.discard();
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
|
||||||
|
}
|
||||||
|
Err(_timeout) => {
|
||||||
|
discard.discard();
|
||||||
|
Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
res
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BatchQueryData {
|
||||||
|
async fn process(
|
||||||
|
self,
|
||||||
|
cancel: CancellationToken,
|
||||||
|
client: &mut Client<tokio_postgres::Client>,
|
||||||
|
parsed_headers: HttpHeaders,
|
||||||
|
) -> Result<Value, SqlOverHttpError> {
|
||||||
|
info!("starting transaction");
|
||||||
|
let (inner, mut discard) = client.inner();
|
||||||
|
let cancel_token = inner.cancel_token();
|
||||||
|
let mut builder = inner.build_transaction();
|
||||||
|
if let Some(isolation_level) = parsed_headers.txn_isolation_level {
|
||||||
|
builder = builder.isolation_level(isolation_level);
|
||||||
|
}
|
||||||
|
if parsed_headers.txn_read_only {
|
||||||
|
builder = builder.read_only(true);
|
||||||
|
}
|
||||||
|
if parsed_headers.txn_deferrable {
|
||||||
|
builder = builder.deferrable(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
let transaction = builder.start().await.map_err(|e| {
|
||||||
|
// if we cannot start a transaction, we should return immediately
|
||||||
|
// and not return to the pool. connection is clearly broken
|
||||||
|
discard.discard();
|
||||||
|
e
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let results =
|
||||||
|
match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
|
||||||
Ok(results) => {
|
Ok(results) => {
|
||||||
info!("commit");
|
info!("commit");
|
||||||
let status = transaction.commit().await.map_err(|e| {
|
let status = transaction.commit().await.map_err(|e| {
|
||||||
@@ -659,44 +738,15 @@ async fn handle_inner(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if txn_read_only {
|
Ok(json!({ "results": results }))
|
||||||
response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
|
}
|
||||||
}
|
|
||||||
if txn_deferrable {
|
|
||||||
response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
|
|
||||||
}
|
|
||||||
if let Some(txn_isolation_level) = txn_isolation_level_raw {
|
|
||||||
response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
|
|
||||||
}
|
|
||||||
json!({ "results": results })
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let metrics = client.metrics();
|
|
||||||
|
|
||||||
// how could this possibly fail
|
|
||||||
let body = serde_json::to_string(&result).expect("json serialization should not fail");
|
|
||||||
let len = body.len();
|
|
||||||
let response = response
|
|
||||||
.body(Body::from(body))
|
|
||||||
// only fails if invalid status code or invalid header/values are given.
|
|
||||||
// these are not user configurable so it cannot fail dynamically
|
|
||||||
.expect("building response payload should not fail");
|
|
||||||
|
|
||||||
// count the egress bytes - we miss the TLS and header overhead but oh well...
|
|
||||||
// moving this later in the stack is going to be a lot of effort and ehhhh
|
|
||||||
metrics.record_egress(len as u64);
|
|
||||||
|
|
||||||
Ok(response)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn query_batch(
|
async fn query_batch(
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
transaction: &Transaction<'_>,
|
transaction: &Transaction<'_>,
|
||||||
queries: BatchQueryData,
|
queries: BatchQueryData,
|
||||||
total_size: &mut usize,
|
parsed_headers: HttpHeaders,
|
||||||
raw_output: bool,
|
|
||||||
array_mode: bool,
|
|
||||||
) -> Result<Vec<Value>, SqlOverHttpError> {
|
) -> Result<Vec<Value>, SqlOverHttpError> {
|
||||||
let mut results = Vec::with_capacity(queries.queries.len());
|
let mut results = Vec::with_capacity(queries.queries.len());
|
||||||
let mut current_size = 0;
|
let mut current_size = 0;
|
||||||
@@ -705,8 +755,7 @@ async fn query_batch(
|
|||||||
transaction,
|
transaction,
|
||||||
stmt,
|
stmt,
|
||||||
&mut current_size,
|
&mut current_size,
|
||||||
raw_output,
|
parsed_headers,
|
||||||
array_mode
|
|
||||||
));
|
));
|
||||||
let cancelled = pin!(cancel.cancelled());
|
let cancelled = pin!(cancel.cancelled());
|
||||||
let res = select(query, cancelled).await;
|
let res = select(query, cancelled).await;
|
||||||
@@ -723,7 +772,6 @@ async fn query_batch(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*total_size += current_size;
|
|
||||||
Ok(results)
|
Ok(results)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -731,8 +779,7 @@ async fn query_to_json<T: GenericClient>(
|
|||||||
client: &T,
|
client: &T,
|
||||||
data: QueryData,
|
data: QueryData,
|
||||||
current_size: &mut usize,
|
current_size: &mut usize,
|
||||||
raw_output: bool,
|
parsed_headers: HttpHeaders,
|
||||||
default_array_mode: bool,
|
|
||||||
) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
|
) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
|
||||||
info!("executing query");
|
info!("executing query");
|
||||||
let query_params = data.params;
|
let query_params = data.params;
|
||||||
@@ -792,12 +839,12 @@ async fn query_to_json<T: GenericClient>(
|
|||||||
columns.push(client.get_type(c.type_oid()).await?);
|
columns.push(client.get_type(c.type_oid()).await?);
|
||||||
}
|
}
|
||||||
|
|
||||||
let array_mode = data.array_mode.unwrap_or(default_array_mode);
|
let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
|
||||||
|
|
||||||
// convert rows to JSON
|
// convert rows to JSON
|
||||||
let rows = rows
|
let rows = rows
|
||||||
.iter()
|
.iter()
|
||||||
.map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
|
.map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
|
||||||
.collect::<Result<Vec<_>, _>>()?;
|
.collect::<Result<Vec<_>, _>>()?;
|
||||||
|
|
||||||
// resulting JSON format is based on the format of node-postgres result
|
// resulting JSON format is based on the format of node-postgres result
|
||||||
|
|||||||
@@ -1,20 +1,35 @@
|
|||||||
//! Periodically collect proxy consumption metrics
|
//! Periodically collect proxy consumption metrics
|
||||||
//! and push them to a HTTP endpoint.
|
//! and push them to a HTTP endpoint.
|
||||||
use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId};
|
use crate::{
|
||||||
use chrono::{DateTime, Utc};
|
config::{MetricBackupCollectionConfig, MetricCollectionConfig},
|
||||||
|
context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||||
|
http,
|
||||||
|
intern::{BranchIdInt, EndpointIdInt},
|
||||||
|
};
|
||||||
|
use anyhow::Context;
|
||||||
|
use async_compression::tokio::write::GzipEncoder;
|
||||||
|
use bytes::Bytes;
|
||||||
|
use chrono::{DateTime, Datelike, Timelike, Utc};
|
||||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||||
use dashmap::{mapref::entry::Entry, DashMap};
|
use dashmap::{mapref::entry::Entry, DashMap};
|
||||||
|
use futures::future::select;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
|
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::{
|
use std::{
|
||||||
convert::Infallible,
|
convert::Infallible,
|
||||||
|
pin::pin,
|
||||||
sync::{
|
sync::{
|
||||||
atomic::{AtomicU64, AtomicUsize, Ordering},
|
atomic::{AtomicU64, AtomicUsize, Ordering},
|
||||||
Arc,
|
Arc,
|
||||||
},
|
},
|
||||||
time::Duration,
|
time::Duration,
|
||||||
};
|
};
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{error, info, instrument, trace};
|
use tracing::{error, info, instrument, trace};
|
||||||
|
use utils::backoff;
|
||||||
|
use uuid::{NoContext, Timestamp};
|
||||||
|
|
||||||
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||||
|
|
||||||
@@ -29,23 +44,97 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
|||||||
/// because we enrich the event with project_id in the control-plane endpoint.
|
/// because we enrich the event with project_id in the control-plane endpoint.
|
||||||
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
|
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
|
||||||
pub struct Ids {
|
pub struct Ids {
|
||||||
pub endpoint_id: EndpointId,
|
pub endpoint_id: EndpointIdInt,
|
||||||
pub branch_id: BranchId,
|
pub branch_id: BranchIdInt,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub trait MetricCounterRecorder {
|
||||||
|
/// Record that some bytes were sent from the proxy to the client
|
||||||
|
fn record_egress(&self, bytes: u64);
|
||||||
|
/// Record that some connections were opened
|
||||||
|
fn record_connection(&self, count: usize);
|
||||||
|
}
|
||||||
|
|
||||||
|
trait MetricCounterReporter {
|
||||||
|
fn get_metrics(&mut self) -> (u64, usize);
|
||||||
|
fn move_metrics(&self) -> (u64, usize);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
struct MetricBackupCounter {
|
||||||
|
transmitted: AtomicU64,
|
||||||
|
opened_connections: AtomicUsize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MetricCounterRecorder for MetricBackupCounter {
|
||||||
|
fn record_egress(&self, bytes: u64) {
|
||||||
|
self.transmitted.fetch_add(bytes, Ordering::AcqRel);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn record_connection(&self, count: usize) {
|
||||||
|
self.opened_connections.fetch_add(count, Ordering::AcqRel);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MetricCounterReporter for MetricBackupCounter {
|
||||||
|
fn get_metrics(&mut self) -> (u64, usize) {
|
||||||
|
(
|
||||||
|
*self.transmitted.get_mut(),
|
||||||
|
*self.opened_connections.get_mut(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
fn move_metrics(&self) -> (u64, usize) {
|
||||||
|
(
|
||||||
|
self.transmitted.swap(0, Ordering::AcqRel),
|
||||||
|
self.opened_connections.swap(0, Ordering::AcqRel),
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct MetricCounter {
|
pub struct MetricCounter {
|
||||||
transmitted: AtomicU64,
|
transmitted: AtomicU64,
|
||||||
opened_connections: AtomicUsize,
|
opened_connections: AtomicUsize,
|
||||||
|
backup: Arc<MetricBackupCounter>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MetricCounter {
|
impl MetricCounterRecorder for MetricCounter {
|
||||||
/// Record that some bytes were sent from the proxy to the client
|
/// Record that some bytes were sent from the proxy to the client
|
||||||
pub fn record_egress(&self, bytes: u64) {
|
fn record_egress(&self, bytes: u64) {
|
||||||
self.transmitted.fetch_add(bytes, Ordering::AcqRel);
|
self.transmitted.fetch_add(bytes, Ordering::AcqRel);
|
||||||
|
self.backup.record_egress(bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Record that some connections were opened
|
||||||
|
fn record_connection(&self, count: usize) {
|
||||||
|
self.opened_connections.fetch_add(count, Ordering::AcqRel);
|
||||||
|
self.backup.record_connection(count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MetricCounterReporter for MetricCounter {
|
||||||
|
fn get_metrics(&mut self) -> (u64, usize) {
|
||||||
|
(
|
||||||
|
*self.transmitted.get_mut(),
|
||||||
|
*self.opened_connections.get_mut(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
fn move_metrics(&self) -> (u64, usize) {
|
||||||
|
(
|
||||||
|
self.transmitted.swap(0, Ordering::AcqRel),
|
||||||
|
self.opened_connections.swap(0, Ordering::AcqRel),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
trait Clearable {
|
||||||
/// extract the value that should be reported
|
/// extract the value that should be reported
|
||||||
|
fn should_report(self: &Arc<Self>) -> Option<u64>;
|
||||||
|
/// Determine whether the counter should be cleared from the global map.
|
||||||
|
fn should_clear(self: &mut Arc<Self>) -> bool;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<C: MetricCounterReporter> Clearable for C {
|
||||||
fn should_report(self: &Arc<Self>) -> Option<u64> {
|
fn should_report(self: &Arc<Self>) -> Option<u64> {
|
||||||
// heuristic to see if the branch is still open
|
// heuristic to see if the branch is still open
|
||||||
// if a clone happens while we are observing, the heuristic will be incorrect.
|
// if a clone happens while we are observing, the heuristic will be incorrect.
|
||||||
@@ -54,13 +143,12 @@ impl MetricCounter {
|
|||||||
// However, for the strong count to be 1 it must have occured that at one instant
|
// However, for the strong count to be 1 it must have occured that at one instant
|
||||||
// all the endpoints were closed, so missing a report because the endpoints are closed is valid.
|
// all the endpoints were closed, so missing a report because the endpoints are closed is valid.
|
||||||
let is_open = Arc::strong_count(self) > 1;
|
let is_open = Arc::strong_count(self) > 1;
|
||||||
let opened = self.opened_connections.swap(0, Ordering::AcqRel);
|
|
||||||
|
|
||||||
// update cached metrics eagerly, even if they can't get sent
|
// update cached metrics eagerly, even if they can't get sent
|
||||||
// (to avoid sending the same metrics twice)
|
// (to avoid sending the same metrics twice)
|
||||||
// see the relevant discussion on why to do so even if the status is not success:
|
// see the relevant discussion on why to do so even if the status is not success:
|
||||||
// https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
|
// https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
|
||||||
let value = self.transmitted.swap(0, Ordering::AcqRel);
|
let (value, opened) = self.move_metrics();
|
||||||
|
|
||||||
// Our only requirement is that we report in every interval if there was an open connection
|
// Our only requirement is that we report in every interval if there was an open connection
|
||||||
// if there were no opened connections since, then we don't need to report
|
// if there were no opened connections since, then we don't need to report
|
||||||
@@ -70,15 +158,12 @@ impl MetricCounter {
|
|||||||
Some(value)
|
Some(value)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Determine whether the counter should be cleared from the global map.
|
|
||||||
fn should_clear(self: &mut Arc<Self>) -> bool {
|
fn should_clear(self: &mut Arc<Self>) -> bool {
|
||||||
// we can't clear this entry if it's acquired elsewhere
|
// we can't clear this entry if it's acquired elsewhere
|
||||||
let Some(counter) = Arc::get_mut(self) else {
|
let Some(counter) = Arc::get_mut(self) else {
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
let opened = *counter.opened_connections.get_mut();
|
let (opened, value) = counter.get_metrics();
|
||||||
let value = *counter.transmitted.get_mut();
|
|
||||||
// clear if there's no data to report
|
// clear if there's no data to report
|
||||||
value == 0 && opened == 0
|
value == 0 && opened == 0
|
||||||
}
|
}
|
||||||
@@ -90,11 +175,26 @@ type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
|
|||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct Metrics {
|
pub struct Metrics {
|
||||||
endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
|
endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
|
||||||
|
backup_endpoints: DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Metrics {
|
impl Metrics {
|
||||||
/// Register a new byte metrics counter for this endpoint
|
/// Register a new byte metrics counter for this endpoint
|
||||||
pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
|
pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
|
||||||
|
let backup = if let Some(entry) = self.backup_endpoints.get(&ids) {
|
||||||
|
entry.clone()
|
||||||
|
} else {
|
||||||
|
self.backup_endpoints
|
||||||
|
.entry(ids.clone())
|
||||||
|
.or_insert_with(|| {
|
||||||
|
Arc::new(MetricBackupCounter {
|
||||||
|
transmitted: AtomicU64::new(0),
|
||||||
|
opened_connections: AtomicUsize::new(0),
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.clone()
|
||||||
|
};
|
||||||
|
|
||||||
let entry = if let Some(entry) = self.endpoints.get(&ids) {
|
let entry = if let Some(entry) = self.endpoints.get(&ids) {
|
||||||
entry.clone()
|
entry.clone()
|
||||||
} else {
|
} else {
|
||||||
@@ -104,12 +204,13 @@ impl Metrics {
|
|||||||
Arc::new(MetricCounter {
|
Arc::new(MetricCounter {
|
||||||
transmitted: AtomicU64::new(0),
|
transmitted: AtomicU64::new(0),
|
||||||
opened_connections: AtomicUsize::new(0),
|
opened_connections: AtomicUsize::new(0),
|
||||||
|
backup: backup.clone(),
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
.clone()
|
.clone()
|
||||||
};
|
};
|
||||||
|
|
||||||
entry.opened_connections.fetch_add(1, Ordering::AcqRel);
|
entry.record_connection(1);
|
||||||
entry
|
entry
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -132,7 +233,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
|
|||||||
|
|
||||||
let now = Utc::now();
|
let now = Utc::now();
|
||||||
collect_metrics_iteration(
|
collect_metrics_iteration(
|
||||||
&USAGE_METRICS,
|
&USAGE_METRICS.endpoints,
|
||||||
&http_client,
|
&http_client,
|
||||||
&config.endpoint,
|
&config.endpoint,
|
||||||
&hostname,
|
&hostname,
|
||||||
@@ -144,24 +245,12 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all)]
|
fn collect_and_clear_metrics<C: Clearable>(
|
||||||
async fn collect_metrics_iteration(
|
endpoints: &DashMap<Ids, Arc<C>, FastHasher>,
|
||||||
metrics: &Metrics,
|
) -> Vec<(Ids, u64)> {
|
||||||
client: &http::ClientWithMiddleware,
|
|
||||||
metric_collection_endpoint: &reqwest::Url,
|
|
||||||
hostname: &str,
|
|
||||||
prev: DateTime<Utc>,
|
|
||||||
now: DateTime<Utc>,
|
|
||||||
) {
|
|
||||||
info!(
|
|
||||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
|
||||||
metric_collection_endpoint
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut metrics_to_clear = Vec::new();
|
let mut metrics_to_clear = Vec::new();
|
||||||
|
|
||||||
let metrics_to_send: Vec<(Ids, u64)> = metrics
|
let metrics_to_send: Vec<(Ids, u64)> = endpoints
|
||||||
.endpoints
|
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|counter| {
|
.filter_map(|counter| {
|
||||||
let key = counter.key().clone();
|
let key = counter.key().clone();
|
||||||
@@ -173,33 +262,71 @@ async fn collect_metrics_iteration(
|
|||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
for metric in metrics_to_clear {
|
||||||
|
match endpoints.entry(metric) {
|
||||||
|
Entry::Occupied(mut counter) => {
|
||||||
|
if counter.get_mut().should_clear() {
|
||||||
|
counter.remove_entry();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Entry::Vacant(_) => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
metrics_to_send
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_event_chunks<'a>(
|
||||||
|
metrics_to_send: &'a [(Ids, u64)],
|
||||||
|
hostname: &'a str,
|
||||||
|
prev: DateTime<Utc>,
|
||||||
|
now: DateTime<Utc>,
|
||||||
|
chunk_size: usize,
|
||||||
|
) -> impl Iterator<Item = EventChunk<'a, Event<Ids, &'static str>>> + 'a {
|
||||||
|
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
||||||
|
metrics_to_send
|
||||||
|
.chunks(chunk_size)
|
||||||
|
.map(move |chunk| EventChunk {
|
||||||
|
events: chunk
|
||||||
|
.iter()
|
||||||
|
.map(|(ids, value)| Event {
|
||||||
|
kind: EventType::Incremental {
|
||||||
|
start_time: prev,
|
||||||
|
stop_time: now,
|
||||||
|
},
|
||||||
|
metric: PROXY_IO_BYTES_PER_CLIENT,
|
||||||
|
idempotency_key: idempotency_key(hostname),
|
||||||
|
value: *value,
|
||||||
|
extra: ids.clone(),
|
||||||
|
})
|
||||||
|
.collect(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(skip_all)]
|
||||||
|
async fn collect_metrics_iteration(
|
||||||
|
endpoints: &DashMap<Ids, Arc<MetricCounter>, FastHasher>,
|
||||||
|
client: &http::ClientWithMiddleware,
|
||||||
|
metric_collection_endpoint: &reqwest::Url,
|
||||||
|
hostname: &str,
|
||||||
|
prev: DateTime<Utc>,
|
||||||
|
now: DateTime<Utc>,
|
||||||
|
) {
|
||||||
|
info!(
|
||||||
|
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
||||||
|
metric_collection_endpoint
|
||||||
|
);
|
||||||
|
|
||||||
|
let metrics_to_send = collect_and_clear_metrics(endpoints);
|
||||||
|
|
||||||
if metrics_to_send.is_empty() {
|
if metrics_to_send.is_empty() {
|
||||||
trace!("no new metrics to send");
|
trace!("no new metrics to send");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send metrics.
|
// Send metrics.
|
||||||
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) {
|
||||||
for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
|
|
||||||
let events = chunk
|
|
||||||
.iter()
|
|
||||||
.map(|(ids, value)| Event {
|
|
||||||
kind: EventType::Incremental {
|
|
||||||
start_time: prev,
|
|
||||||
stop_time: now,
|
|
||||||
},
|
|
||||||
metric: PROXY_IO_BYTES_PER_CLIENT,
|
|
||||||
idempotency_key: idempotency_key(hostname),
|
|
||||||
value: *value,
|
|
||||||
extra: Ids {
|
|
||||||
endpoint_id: ids.endpoint_id.clone(),
|
|
||||||
branch_id: ids.branch_id.clone(),
|
|
||||||
},
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let res = client
|
let res = client
|
||||||
.post(metric_collection_endpoint.clone())
|
.post(metric_collection_endpoint.clone())
|
||||||
.json(&EventChunk { events })
|
.json(&chunk)
|
||||||
.send()
|
.send()
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
@@ -213,23 +340,142 @@ async fn collect_metrics_iteration(
|
|||||||
|
|
||||||
if !res.status().is_success() {
|
if !res.status().is_success() {
|
||||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||||
for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) {
|
for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) {
|
||||||
// Report if the metric value is suspiciously large
|
// Report if the metric value is suspiciously large
|
||||||
error!("potentially abnormal metric value: {:?}", metric);
|
error!("potentially abnormal metric value: {:?}", metric);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for metric in metrics_to_clear {
|
pub async fn task_backup(
|
||||||
match metrics.endpoints.entry(metric) {
|
backup_config: &MetricBackupCollectionConfig,
|
||||||
Entry::Occupied(mut counter) => {
|
cancellation_token: CancellationToken,
|
||||||
if counter.get_mut().should_clear() {
|
) -> anyhow::Result<()> {
|
||||||
counter.remove_entry();
|
info!("metrics backup config: {backup_config:?}");
|
||||||
}
|
scopeguard::defer! {
|
||||||
}
|
info!("metrics backup has shut down");
|
||||||
Entry::Vacant(_) => {}
|
}
|
||||||
|
// Even if the remote storage is not configured, we still want to clear the metrics.
|
||||||
|
let storage = backup_config
|
||||||
|
.remote_storage_config
|
||||||
|
.as_ref()
|
||||||
|
.map(|config| GenericRemoteStorage::from_config(config).context("remote storage init"))
|
||||||
|
.transpose()?;
|
||||||
|
let mut ticker = tokio::time::interval(backup_config.interval);
|
||||||
|
let mut prev = Utc::now();
|
||||||
|
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
|
||||||
|
loop {
|
||||||
|
select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await;
|
||||||
|
let now = Utc::now();
|
||||||
|
collect_metrics_backup_iteration(
|
||||||
|
&USAGE_METRICS.backup_endpoints,
|
||||||
|
&storage,
|
||||||
|
&hostname,
|
||||||
|
prev,
|
||||||
|
now,
|
||||||
|
backup_config.chunk_size,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
prev = now;
|
||||||
|
if cancellation_token.is_cancelled() {
|
||||||
|
info!("metrics backup has been cancelled");
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(skip_all)]
|
||||||
|
async fn collect_metrics_backup_iteration(
|
||||||
|
endpoints: &DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
|
||||||
|
storage: &Option<GenericRemoteStorage>,
|
||||||
|
hostname: &str,
|
||||||
|
prev: DateTime<Utc>,
|
||||||
|
now: DateTime<Utc>,
|
||||||
|
chunk_size: usize,
|
||||||
|
) {
|
||||||
|
let year = now.year();
|
||||||
|
let month = now.month();
|
||||||
|
let day = now.day();
|
||||||
|
let hour = now.hour();
|
||||||
|
let minute = now.minute();
|
||||||
|
let second = now.second();
|
||||||
|
let cancel = CancellationToken::new();
|
||||||
|
|
||||||
|
info!("starting collect_metrics_backup_iteration");
|
||||||
|
|
||||||
|
let metrics_to_send = collect_and_clear_metrics(endpoints);
|
||||||
|
|
||||||
|
if metrics_to_send.is_empty() {
|
||||||
|
trace!("no new metrics to send");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send metrics.
|
||||||
|
for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) {
|
||||||
|
let real_now = Utc::now();
|
||||||
|
let id = uuid::Uuid::new_v7(Timestamp::from_unix(
|
||||||
|
NoContext,
|
||||||
|
real_now.second().into(),
|
||||||
|
real_now.nanosecond(),
|
||||||
|
));
|
||||||
|
let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz");
|
||||||
|
let remote_path = match RemotePath::from_string(&path) {
|
||||||
|
Ok(remote_path) => remote_path,
|
||||||
|
Err(e) => {
|
||||||
|
error!("failed to create remote path from str {path}: {:?}", e);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await;
|
||||||
|
|
||||||
|
if let Err(e) = res {
|
||||||
|
error!(
|
||||||
|
"failed to upload consumption events to remote storage: {:?}",
|
||||||
|
e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn upload_events_chunk(
|
||||||
|
storage: &Option<GenericRemoteStorage>,
|
||||||
|
chunk: EventChunk<'_, Event<Ids, &'static str>>,
|
||||||
|
remote_path: &RemotePath,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let storage = match storage {
|
||||||
|
Some(storage) => storage,
|
||||||
|
None => {
|
||||||
|
error!("no remote storage configured");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let data = serde_json::to_vec(&chunk).context("serialize metrics")?;
|
||||||
|
let mut encoder = GzipEncoder::new(Vec::new());
|
||||||
|
encoder.write_all(&data).await.context("compress metrics")?;
|
||||||
|
encoder.shutdown().await.context("compress metrics")?;
|
||||||
|
let compressed_data: Bytes = encoder.get_ref().clone().into();
|
||||||
|
backoff::retry(
|
||||||
|
|| async {
|
||||||
|
let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone())));
|
||||||
|
storage
|
||||||
|
.upload(stream, compressed_data.len(), remote_path, None, cancel)
|
||||||
|
.await
|
||||||
|
},
|
||||||
|
TimeoutOrCancel::caused_by_cancel,
|
||||||
|
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||||
|
FAILED_UPLOAD_MAX_RETRIES,
|
||||||
|
"request_data_upload",
|
||||||
|
cancel,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||||
|
.and_then(|x| x)
|
||||||
|
.context("request_data_upload")?;
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -248,8 +494,8 @@ mod tests {
|
|||||||
};
|
};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
use super::{collect_metrics_iteration, Ids, Metrics};
|
use super::*;
|
||||||
use crate::{http, rate_limiter::RateLimiterConfig};
|
use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId};
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn metrics() {
|
async fn metrics() {
|
||||||
@@ -284,18 +530,19 @@ mod tests {
|
|||||||
let now = Utc::now();
|
let now = Utc::now();
|
||||||
|
|
||||||
// no counters have been registered
|
// no counters have been registered
|
||||||
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
|
collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
|
||||||
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
||||||
assert!(r.is_empty());
|
assert!(r.is_empty());
|
||||||
|
|
||||||
// register a new counter
|
// register a new counter
|
||||||
|
|
||||||
let counter = metrics.register(Ids {
|
let counter = metrics.register(Ids {
|
||||||
endpoint_id: "e1".into(),
|
endpoint_id: (&EndpointId::from("e1")).into(),
|
||||||
branch_id: "b1".into(),
|
branch_id: (&BranchId::from("b1")).into(),
|
||||||
});
|
});
|
||||||
|
|
||||||
// the counter should be observed despite 0 egress
|
// the counter should be observed despite 0 egress
|
||||||
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
|
collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
|
||||||
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
||||||
assert_eq!(r.len(), 1);
|
assert_eq!(r.len(), 1);
|
||||||
assert_eq!(r[0].events.len(), 1);
|
assert_eq!(r[0].events.len(), 1);
|
||||||
@@ -305,7 +552,7 @@ mod tests {
|
|||||||
counter.record_egress(1);
|
counter.record_egress(1);
|
||||||
|
|
||||||
// egress should be observed
|
// egress should be observed
|
||||||
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
|
collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
|
||||||
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
||||||
assert_eq!(r.len(), 1);
|
assert_eq!(r.len(), 1);
|
||||||
assert_eq!(r[0].events.len(), 1);
|
assert_eq!(r[0].events.len(), 1);
|
||||||
@@ -315,11 +562,19 @@ mod tests {
|
|||||||
drop(counter);
|
drop(counter);
|
||||||
|
|
||||||
// we do not observe the counter
|
// we do not observe the counter
|
||||||
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
|
collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
|
||||||
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
||||||
assert!(r.is_empty());
|
assert!(r.is_empty());
|
||||||
|
|
||||||
// counter is unregistered
|
// counter is unregistered
|
||||||
assert!(metrics.endpoints.is_empty());
|
assert!(metrics.endpoints.is_empty());
|
||||||
|
|
||||||
|
collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
|
||||||
|
.await;
|
||||||
|
assert!(!metrics.backup_endpoints.is_empty());
|
||||||
|
collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
|
||||||
|
.await;
|
||||||
|
// backup counter is unregistered after the second iteration
|
||||||
|
assert!(metrics.backup_endpoints.is_empty());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -94,4 +94,5 @@ select = [
|
|||||||
"I", # isort
|
"I", # isort
|
||||||
"W", # pycodestyle
|
"W", # pycodestyle
|
||||||
"B", # bugbear
|
"B", # bugbear
|
||||||
|
"UP032", # f-string
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ once_cell.workspace = true
|
|||||||
parking_lot.workspace = true
|
parking_lot.workspace = true
|
||||||
postgres.workspace = true
|
postgres.workspace = true
|
||||||
postgres-protocol.workspace = true
|
postgres-protocol.workspace = true
|
||||||
|
rand.workspace = true
|
||||||
regex.workspace = true
|
regex.workspace = true
|
||||||
scopeguard.workspace = true
|
scopeguard.workspace = true
|
||||||
reqwest = { workspace = true, features = ["json"] }
|
reqwest = { workspace = true, features = ["json"] }
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ use utils::pid_file;
|
|||||||
use metrics::set_build_info_metric;
|
use metrics::set_build_info_metric;
|
||||||
use safekeeper::defaults::{
|
use safekeeper::defaults::{
|
||||||
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||||
DEFAULT_PG_LISTEN_ADDR,
|
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
|
||||||
};
|
};
|
||||||
use safekeeper::wal_service;
|
use safekeeper::wal_service;
|
||||||
use safekeeper::GlobalTimelines;
|
use safekeeper::GlobalTimelines;
|
||||||
@@ -170,6 +170,13 @@ struct Args {
|
|||||||
/// still needed for existing replication connection.
|
/// still needed for existing replication connection.
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
walsenders_keep_horizon: bool,
|
walsenders_keep_horizon: bool,
|
||||||
|
/// Enable partial backup. If disabled, safekeeper will not upload partial
|
||||||
|
/// segments to remote storage.
|
||||||
|
#[arg(long)]
|
||||||
|
partial_backup_enabled: bool,
|
||||||
|
/// Controls how long backup will wait until uploading the partial segment.
|
||||||
|
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
|
||||||
|
partial_backup_timeout: Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Like PathBufValueParser, but allows empty string.
|
// Like PathBufValueParser, but allows empty string.
|
||||||
@@ -300,6 +307,8 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
http_auth,
|
http_auth,
|
||||||
current_thread_runtime: args.current_thread_runtime,
|
current_thread_runtime: args.current_thread_runtime,
|
||||||
walsenders_keep_horizon: args.walsenders_keep_horizon,
|
walsenders_keep_horizon: args.walsenders_keep_horizon,
|
||||||
|
partial_backup_enabled: args.partial_backup_enabled,
|
||||||
|
partial_backup_timeout: args.partial_backup_timeout,
|
||||||
};
|
};
|
||||||
|
|
||||||
// initialize sentry if SENTRY_DSN is provided
|
// initialize sentry if SENTRY_DSN is provided
|
||||||
@@ -365,6 +374,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
|||||||
|
|
||||||
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
|
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
|
||||||
|
|
||||||
|
wal_backup::init_remote_storage(&conf);
|
||||||
|
|
||||||
// Keep handles to main tasks to die if any of them disappears.
|
// Keep handles to main tasks to die if any of them disappears.
|
||||||
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
|
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
|
||||||
FuturesUnordered::new();
|
FuturesUnordered::new();
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
|
|||||||
use crate::SafeKeeperConf;
|
use crate::SafeKeeperConf;
|
||||||
|
|
||||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||||
pub const SK_FORMAT_VERSION: u32 = 7;
|
pub const SK_FORMAT_VERSION: u32 = 8;
|
||||||
|
|
||||||
// contains persistent metadata for safekeeper
|
// contains persistent metadata for safekeeper
|
||||||
const CONTROL_FILE_NAME: &str = "safekeeper.control";
|
const CONTROL_FILE_NAME: &str = "safekeeper.control";
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
use crate::{
|
use crate::{
|
||||||
safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
|
safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
|
||||||
state::{PersistedPeers, TimelinePersistentState},
|
state::{PersistedPeers, TimelinePersistentState},
|
||||||
|
wal_backup_partial,
|
||||||
};
|
};
|
||||||
use anyhow::{bail, Result};
|
use anyhow::{bail, Result};
|
||||||
use pq_proto::SystemId;
|
use pq_proto::SystemId;
|
||||||
@@ -138,6 +139,50 @@ pub struct SafeKeeperStateV4 {
|
|||||||
pub peers: PersistedPeers,
|
pub peers: PersistedPeers,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
|
pub struct SafeKeeperStateV7 {
|
||||||
|
#[serde(with = "hex")]
|
||||||
|
pub tenant_id: TenantId,
|
||||||
|
#[serde(with = "hex")]
|
||||||
|
pub timeline_id: TimelineId,
|
||||||
|
/// persistent acceptor state
|
||||||
|
pub acceptor_state: AcceptorState,
|
||||||
|
/// information about server
|
||||||
|
pub server: ServerInfo,
|
||||||
|
/// Unique id of the last *elected* proposer we dealt with. Not needed
|
||||||
|
/// for correctness, exists for monitoring purposes.
|
||||||
|
#[serde(with = "hex")]
|
||||||
|
pub proposer_uuid: PgUuid,
|
||||||
|
/// Since which LSN this timeline generally starts. Safekeeper might have
|
||||||
|
/// joined later.
|
||||||
|
pub timeline_start_lsn: Lsn,
|
||||||
|
/// Since which LSN safekeeper has (had) WAL for this timeline.
|
||||||
|
/// All WAL segments next to one containing local_start_lsn are
|
||||||
|
/// filled with data from the beginning.
|
||||||
|
pub local_start_lsn: Lsn,
|
||||||
|
/// Part of WAL acknowledged by quorum *and available locally*. Always points
|
||||||
|
/// to record boundary.
|
||||||
|
pub commit_lsn: Lsn,
|
||||||
|
/// LSN that points to the end of the last backed up segment. Useful to
|
||||||
|
/// persist to avoid finding out offloading progress on boot.
|
||||||
|
pub backup_lsn: Lsn,
|
||||||
|
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
|
||||||
|
/// of last record streamed to everyone). Persisting it helps skipping
|
||||||
|
/// recovery in walproposer, generally we compute it from peers. In
|
||||||
|
/// walproposer proto called 'truncate_lsn'. Updates are currently driven
|
||||||
|
/// only by walproposer.
|
||||||
|
pub peer_horizon_lsn: Lsn,
|
||||||
|
/// LSN of the oldest known checkpoint made by pageserver and successfully
|
||||||
|
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||||
|
/// informational purposes, we receive it from pageserver (or broker).
|
||||||
|
pub remote_consistent_lsn: Lsn,
|
||||||
|
// Peers and their state as we remember it. Knowing peers themselves is
|
||||||
|
// fundamental; but state is saved here only for informational purposes and
|
||||||
|
// obviously can be stale. (Currently not saved at all, but let's provision
|
||||||
|
// place to have less file version upgrades).
|
||||||
|
pub peers: PersistedPeers,
|
||||||
|
}
|
||||||
|
|
||||||
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
|
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
|
||||||
// migrate to storing full term history
|
// migrate to storing full term history
|
||||||
if version == 1 {
|
if version == 1 {
|
||||||
@@ -167,6 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
|||||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||||
remote_consistent_lsn: Lsn(0),
|
remote_consistent_lsn: Lsn(0),
|
||||||
peers: PersistedPeers(vec![]),
|
peers: PersistedPeers(vec![]),
|
||||||
|
partial_backup: wal_backup_partial::State::default(),
|
||||||
});
|
});
|
||||||
// migrate to hexing some ids
|
// migrate to hexing some ids
|
||||||
} else if version == 2 {
|
} else if version == 2 {
|
||||||
@@ -190,6 +236,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
|||||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||||
remote_consistent_lsn: Lsn(0),
|
remote_consistent_lsn: Lsn(0),
|
||||||
peers: PersistedPeers(vec![]),
|
peers: PersistedPeers(vec![]),
|
||||||
|
partial_backup: wal_backup_partial::State::default(),
|
||||||
});
|
});
|
||||||
// migrate to moving tenant_id/timeline_id to the top and adding some lsns
|
// migrate to moving tenant_id/timeline_id to the top and adding some lsns
|
||||||
} else if version == 3 {
|
} else if version == 3 {
|
||||||
@@ -213,6 +260,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
|||||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||||
remote_consistent_lsn: Lsn(0),
|
remote_consistent_lsn: Lsn(0),
|
||||||
peers: PersistedPeers(vec![]),
|
peers: PersistedPeers(vec![]),
|
||||||
|
partial_backup: wal_backup_partial::State::default(),
|
||||||
});
|
});
|
||||||
// migrate to having timeline_start_lsn
|
// migrate to having timeline_start_lsn
|
||||||
} else if version == 4 {
|
} else if version == 4 {
|
||||||
@@ -236,6 +284,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
|||||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||||
remote_consistent_lsn: Lsn(0),
|
remote_consistent_lsn: Lsn(0),
|
||||||
peers: PersistedPeers(vec![]),
|
peers: PersistedPeers(vec![]),
|
||||||
|
partial_backup: wal_backup_partial::State::default(),
|
||||||
});
|
});
|
||||||
} else if version == 5 {
|
} else if version == 5 {
|
||||||
info!("reading safekeeper control file version {}", version);
|
info!("reading safekeeper control file version {}", version);
|
||||||
@@ -262,7 +311,30 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
|||||||
oldstate.server.pg_version = 140005;
|
oldstate.server.pg_version = 140005;
|
||||||
|
|
||||||
return Ok(oldstate);
|
return Ok(oldstate);
|
||||||
|
} else if version == 7 {
|
||||||
|
info!("reading safekeeper control file version {}", version);
|
||||||
|
let oldstate = SafeKeeperStateV7::des(&buf[..buf.len()])?;
|
||||||
|
|
||||||
|
return Ok(TimelinePersistentState {
|
||||||
|
tenant_id: oldstate.tenant_id,
|
||||||
|
timeline_id: oldstate.timeline_id,
|
||||||
|
acceptor_state: oldstate.acceptor_state,
|
||||||
|
server: oldstate.server,
|
||||||
|
proposer_uuid: oldstate.proposer_uuid,
|
||||||
|
timeline_start_lsn: oldstate.timeline_start_lsn,
|
||||||
|
local_start_lsn: oldstate.local_start_lsn,
|
||||||
|
commit_lsn: oldstate.commit_lsn,
|
||||||
|
backup_lsn: oldstate.backup_lsn,
|
||||||
|
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||||
|
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||||
|
peers: oldstate.peers,
|
||||||
|
partial_backup: wal_backup_partial::State::default(),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: persist the file back to the disk after upgrade
|
||||||
|
// TODO: think about backward compatibility and rollbacks
|
||||||
|
|
||||||
bail!("unsupported safekeeper control file version {}", version)
|
bail!("unsupported safekeeper control file version {}", version)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ pub mod send_wal;
|
|||||||
pub mod state;
|
pub mod state;
|
||||||
pub mod timeline;
|
pub mod timeline;
|
||||||
pub mod wal_backup;
|
pub mod wal_backup;
|
||||||
|
pub mod wal_backup_partial;
|
||||||
pub mod wal_service;
|
pub mod wal_service;
|
||||||
pub mod wal_storage;
|
pub mod wal_storage;
|
||||||
|
|
||||||
@@ -48,6 +49,7 @@ pub mod defaults {
|
|||||||
|
|
||||||
pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
|
pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
|
||||||
pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
|
pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
|
||||||
|
pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
@@ -79,6 +81,8 @@ pub struct SafeKeeperConf {
|
|||||||
pub http_auth: Option<Arc<SwappableJwtAuth>>,
|
pub http_auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
pub current_thread_runtime: bool,
|
pub current_thread_runtime: bool,
|
||||||
pub walsenders_keep_horizon: bool,
|
pub walsenders_keep_horizon: bool,
|
||||||
|
pub partial_backup_enabled: bool,
|
||||||
|
pub partial_backup_timeout: Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SafeKeeperConf {
|
impl SafeKeeperConf {
|
||||||
@@ -123,6 +127,8 @@ impl SafeKeeperConf {
|
|||||||
max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||||
current_thread_runtime: false,
|
current_thread_runtime: false,
|
||||||
walsenders_keep_horizon: false,
|
walsenders_keep_horizon: false,
|
||||||
|
partial_backup_enabled: false,
|
||||||
|
partial_backup_timeout: Duration::from_secs(0),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -147,6 +147,21 @@ pub static RECEIVED_PS_FEEDBACKS: Lazy<IntCounter> = Lazy::new(|| {
|
|||||||
)
|
)
|
||||||
.expect("Failed to register safekeeper_received_ps_feedbacks_total counter")
|
.expect("Failed to register safekeeper_received_ps_feedbacks_total counter")
|
||||||
});
|
});
|
||||||
|
pub static PARTIAL_BACKUP_UPLOADS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
|
register_int_counter_vec!(
|
||||||
|
"safekeeper_partial_backup_uploads_total",
|
||||||
|
"Number of partial backup uploads to the S3",
|
||||||
|
&["result"]
|
||||||
|
)
|
||||||
|
.expect("Failed to register safekeeper_partial_backup_uploads_total counter")
|
||||||
|
});
|
||||||
|
pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||||
|
register_int_counter!(
|
||||||
|
"safekeeper_partial_backup_uploaded_bytes_total",
|
||||||
|
"Number of bytes uploaded to the S3 during partial backup"
|
||||||
|
)
|
||||||
|
.expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
|
||||||
|
});
|
||||||
|
|
||||||
pub const LABEL_UNKNOWN: &str = "unknown";
|
pub const LABEL_UNKNOWN: &str = "unknown";
|
||||||
|
|
||||||
|
|||||||
@@ -1221,6 +1221,7 @@ mod tests {
|
|||||||
commit_lsn: Lsn(1234567600),
|
commit_lsn: Lsn(1234567600),
|
||||||
},
|
},
|
||||||
)]),
|
)]),
|
||||||
|
partial_backup: crate::wal_backup_partial::State::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let ser = state.ser().unwrap();
|
let ser = state.ser().unwrap();
|
||||||
@@ -1266,6 +1267,8 @@ mod tests {
|
|||||||
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||||
0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||||
0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||||
|
// partial_backup
|
||||||
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||||
];
|
];
|
||||||
|
|
||||||
assert_eq!(Hex(&ser), Hex(&expected));
|
assert_eq!(Hex(&ser), Hex(&expected));
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ use utils::{
|
|||||||
use crate::{
|
use crate::{
|
||||||
control_file,
|
control_file,
|
||||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
|
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
|
||||||
|
wal_backup_partial::{self},
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Persistent information stored on safekeeper node about timeline.
|
/// Persistent information stored on safekeeper node about timeline.
|
||||||
@@ -54,11 +55,14 @@ pub struct TimelinePersistentState {
|
|||||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||||
/// informational purposes, we receive it from pageserver (or broker).
|
/// informational purposes, we receive it from pageserver (or broker).
|
||||||
pub remote_consistent_lsn: Lsn,
|
pub remote_consistent_lsn: Lsn,
|
||||||
// Peers and their state as we remember it. Knowing peers themselves is
|
/// Peers and their state as we remember it. Knowing peers themselves is
|
||||||
// fundamental; but state is saved here only for informational purposes and
|
/// fundamental; but state is saved here only for informational purposes and
|
||||||
// obviously can be stale. (Currently not saved at all, but let's provision
|
/// obviously can be stale. (Currently not saved at all, but let's provision
|
||||||
// place to have less file version upgrades).
|
/// place to have less file version upgrades).
|
||||||
pub peers: PersistedPeers,
|
pub peers: PersistedPeers,
|
||||||
|
/// Holds names of partial segments uploaded to remote storage. Used to
|
||||||
|
/// clean up old objects without leaving garbage in remote storage.
|
||||||
|
pub partial_backup: wal_backup_partial::State,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
@@ -93,6 +97,7 @@ impl TimelinePersistentState {
|
|||||||
.map(|p| (*p, PersistedPeerInfo::new()))
|
.map(|p| (*p, PersistedPeerInfo::new()))
|
||||||
.collect(),
|
.collect(),
|
||||||
),
|
),
|
||||||
|
partial_backup: wal_backup_partial::State::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
|
|||||||
|
|
||||||
use crate::metrics::FullTimelineInfo;
|
use crate::metrics::FullTimelineInfo;
|
||||||
use crate::wal_storage::Storage as wal_storage_iface;
|
use crate::wal_storage::Storage as wal_storage_iface;
|
||||||
use crate::{debug_dump, wal_storage};
|
use crate::{debug_dump, wal_backup_partial, wal_storage};
|
||||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||||
|
|
||||||
/// Things safekeeper should know about timeline state on peers.
|
/// Things safekeeper should know about timeline state on peers.
|
||||||
@@ -503,6 +503,9 @@ impl Timeline {
|
|||||||
if conf.peer_recovery_enabled {
|
if conf.peer_recovery_enabled {
|
||||||
tokio::spawn(recovery_main(self.clone(), conf.clone()));
|
tokio::spawn(recovery_main(self.clone(), conf.clone()));
|
||||||
}
|
}
|
||||||
|
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
|
||||||
|
tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Delete timeline from disk completely, by removing timeline directory.
|
/// Delete timeline from disk completely, by removing timeline directory.
|
||||||
@@ -667,8 +670,8 @@ impl Timeline {
|
|||||||
term_flush_lsn =
|
term_flush_lsn =
|
||||||
TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
|
TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
|
||||||
}
|
}
|
||||||
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
|
||||||
self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
|
self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
|
||||||
|
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
||||||
Ok(rmsg)
|
Ok(rmsg)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ use std::time::Duration;
|
|||||||
use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
|
use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
|
||||||
use postgres_ffi::XLogFileName;
|
use postgres_ffi::XLogFileName;
|
||||||
use postgres_ffi::{XLogSegNo, PG_TLI};
|
use postgres_ffi::{XLogSegNo, PG_TLI};
|
||||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata};
|
||||||
use tokio::fs::File;
|
use tokio::fs::File;
|
||||||
|
|
||||||
use tokio::select;
|
use tokio::select;
|
||||||
@@ -180,6 +180,16 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn init_remote_storage(conf: &SafeKeeperConf) {
|
||||||
|
// TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
|
||||||
|
// dependencies to all tasks instead.
|
||||||
|
REMOTE_STORAGE.get_or_init(|| {
|
||||||
|
conf.remote_storage
|
||||||
|
.as_ref()
|
||||||
|
.map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
|
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
|
||||||
|
|
||||||
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
|
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
|
||||||
@@ -194,14 +204,6 @@ pub async fn wal_backup_launcher_task_main(
|
|||||||
conf.remote_storage
|
conf.remote_storage
|
||||||
);
|
);
|
||||||
|
|
||||||
let conf_ = conf.clone();
|
|
||||||
REMOTE_STORAGE.get_or_init(|| {
|
|
||||||
conf_
|
|
||||||
.remote_storage
|
|
||||||
.as_ref()
|
|
||||||
.map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
|
|
||||||
});
|
|
||||||
|
|
||||||
// Presence in this map means launcher is aware s3 offloading is needed for
|
// Presence in this map means launcher is aware s3 offloading is needed for
|
||||||
// the timeline, but task is started only if it makes sense to offload
|
// the timeline, but task is started only if it makes sense to offload
|
||||||
// from this safekeeper.
|
// from this safekeeper.
|
||||||
@@ -518,6 +520,35 @@ async fn backup_object(
|
|||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn backup_partial_segment(
|
||||||
|
source_file: &Utf8Path,
|
||||||
|
target_file: &RemotePath,
|
||||||
|
size: usize,
|
||||||
|
) -> Result<()> {
|
||||||
|
let storage = get_configured_remote_storage();
|
||||||
|
|
||||||
|
let file = File::open(&source_file)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
|
||||||
|
|
||||||
|
// limiting the file to read only the first `size` bytes
|
||||||
|
let limited_file = tokio::io::AsyncReadExt::take(file, size as u64);
|
||||||
|
|
||||||
|
let file = tokio_util::io::ReaderStream::with_capacity(limited_file, BUFFER_SIZE);
|
||||||
|
|
||||||
|
let cancel = CancellationToken::new();
|
||||||
|
|
||||||
|
storage
|
||||||
|
.upload(
|
||||||
|
file,
|
||||||
|
size,
|
||||||
|
target_file,
|
||||||
|
Some(StorageMetadata::from([("sk_type", "partial_segment")])),
|
||||||
|
&cancel,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn read_object(
|
pub async fn read_object(
|
||||||
file_path: &RemotePath,
|
file_path: &RemotePath,
|
||||||
offset: u64,
|
offset: u64,
|
||||||
@@ -604,6 +635,13 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Used by wal_backup_partial.
|
||||||
|
pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> {
|
||||||
|
let cancel = CancellationToken::new(); // not really used
|
||||||
|
let storage = get_configured_remote_storage();
|
||||||
|
storage.delete_objects(paths, &cancel).await
|
||||||
|
}
|
||||||
|
|
||||||
/// Copy segments from one timeline to another. Used in copy_timeline.
|
/// Copy segments from one timeline to another. Used in copy_timeline.
|
||||||
pub async fn copy_s3_segments(
|
pub async fn copy_s3_segments(
|
||||||
wal_seg_size: usize,
|
wal_seg_size: usize,
|
||||||
|
|||||||
407
safekeeper/src/wal_backup_partial.rs
Normal file
407
safekeeper/src/wal_backup_partial.rs
Normal file
@@ -0,0 +1,407 @@
|
|||||||
|
//! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
|
||||||
|
//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
|
||||||
|
//! was changed), the segment will be uploaded to S3 in about 15 minutes.
|
||||||
|
//!
|
||||||
|
//! The filename format for partial segments is
|
||||||
|
//! `Segment_Term_Flush_Commit_skNN.partial`, where:
|
||||||
|
//! - `Segment` – the segment name, like `000000010000000000000001`
|
||||||
|
//! - `Term` – current term
|
||||||
|
//! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. `00000000346BC568`
|
||||||
|
//! - `Commit` – commit_lsn in the same hex format
|
||||||
|
//! - `NN` – safekeeper_id, like `1`
|
||||||
|
//!
|
||||||
|
//! The full object name example:
|
||||||
|
//! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial`
|
||||||
|
//!
|
||||||
|
//! Each safekeeper will keep info about remote partial segments in its control
|
||||||
|
//! file. Code updates state in the control file before doing any S3 operations.
|
||||||
|
//! This way control file stores information about all potentially existing
|
||||||
|
//! remote partial segments and can clean them up after uploading a newer version.
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use camino::Utf8PathBuf;
|
||||||
|
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
|
||||||
|
use rand::Rng;
|
||||||
|
use remote_storage::RemotePath;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use tracing::{debug, error, info, instrument};
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
|
||||||
|
safekeeper::Term,
|
||||||
|
timeline::Timeline,
|
||||||
|
wal_backup, SafeKeeperConf,
|
||||||
|
};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
|
pub enum UploadStatus {
|
||||||
|
/// Upload is in progress
|
||||||
|
InProgress,
|
||||||
|
/// Upload is finished
|
||||||
|
Uploaded,
|
||||||
|
/// Deletion is in progress
|
||||||
|
Deleting,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||||
|
pub struct PartialRemoteSegment {
|
||||||
|
pub status: UploadStatus,
|
||||||
|
pub name: String,
|
||||||
|
pub commit_lsn: Lsn,
|
||||||
|
pub flush_lsn: Lsn,
|
||||||
|
pub term: Term,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialRemoteSegment {
|
||||||
|
fn eq_without_status(&self, other: &Self) -> bool {
|
||||||
|
self.name == other.name
|
||||||
|
&& self.commit_lsn == other.commit_lsn
|
||||||
|
&& self.flush_lsn == other.flush_lsn
|
||||||
|
&& self.term == other.term
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NB: these structures are a part of a control_file, you can't change them without
|
||||||
|
// changing the control file format version.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
|
||||||
|
pub struct State {
|
||||||
|
pub segments: Vec<PartialRemoteSegment>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl State {
|
||||||
|
/// Find an Uploaded segment. There should be only one Uploaded segment at a time.
|
||||||
|
fn uploaded_segment(&self) -> Option<PartialRemoteSegment> {
|
||||||
|
self.segments
|
||||||
|
.iter()
|
||||||
|
.find(|seg| seg.status == UploadStatus::Uploaded)
|
||||||
|
.cloned()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct PartialBackup {
|
||||||
|
wal_seg_size: usize,
|
||||||
|
tli: Arc<Timeline>,
|
||||||
|
conf: SafeKeeperConf,
|
||||||
|
local_prefix: Utf8PathBuf,
|
||||||
|
remote_prefix: Utf8PathBuf,
|
||||||
|
|
||||||
|
state: State,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read-only methods for getting segment names
|
||||||
|
impl PartialBackup {
|
||||||
|
fn segno(&self, lsn: Lsn) -> XLogSegNo {
|
||||||
|
lsn.segment_number(self.wal_seg_size)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn segment_name(&self, segno: u64) -> String {
|
||||||
|
XLogFileName(PG_TLI, segno, self.wal_seg_size)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn remote_segment_name(
|
||||||
|
&self,
|
||||||
|
segno: u64,
|
||||||
|
term: u64,
|
||||||
|
commit_lsn: Lsn,
|
||||||
|
flush_lsn: Lsn,
|
||||||
|
) -> String {
|
||||||
|
format!(
|
||||||
|
"{}_{}_{:016X}_{:016X}_sk{}.partial",
|
||||||
|
self.segment_name(segno),
|
||||||
|
term,
|
||||||
|
flush_lsn.0,
|
||||||
|
commit_lsn.0,
|
||||||
|
self.conf.my_id.0,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn local_segment_name(&self, segno: u64) -> String {
|
||||||
|
format!("{}.partial", self.segment_name(segno))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialBackup {
|
||||||
|
/// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded.
|
||||||
|
async fn prepare_upload(&self) -> PartialRemoteSegment {
|
||||||
|
// this operation takes a lock to get the actual state
|
||||||
|
let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
|
||||||
|
let flush_lsn = Lsn(sk_info.flush_lsn);
|
||||||
|
let commit_lsn = Lsn(sk_info.commit_lsn);
|
||||||
|
let term = sk_info.term;
|
||||||
|
let segno = self.segno(flush_lsn);
|
||||||
|
|
||||||
|
let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn);
|
||||||
|
|
||||||
|
PartialRemoteSegment {
|
||||||
|
status: UploadStatus::InProgress,
|
||||||
|
name,
|
||||||
|
commit_lsn,
|
||||||
|
flush_lsn,
|
||||||
|
term,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reads segment from disk and uploads it to the remote storage.
|
||||||
|
async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> {
|
||||||
|
let flush_lsn = prepared.flush_lsn;
|
||||||
|
let segno = self.segno(flush_lsn);
|
||||||
|
|
||||||
|
// We're going to backup bytes from the start of the segment up to flush_lsn.
|
||||||
|
let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size);
|
||||||
|
|
||||||
|
let local_path = self.local_prefix.join(self.local_segment_name(segno));
|
||||||
|
let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?;
|
||||||
|
|
||||||
|
// Upload first `backup_bytes` bytes of the segment to the remote storage.
|
||||||
|
wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
|
||||||
|
PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64);
|
||||||
|
|
||||||
|
// We uploaded the segment, now let's verify that the data is still actual.
|
||||||
|
// If the term changed, we cannot guarantee the validity of the uploaded data.
|
||||||
|
// If the term is the same, we know the data is not corrupted.
|
||||||
|
let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
|
||||||
|
if sk_info.term != prepared.term {
|
||||||
|
anyhow::bail!("term changed during upload");
|
||||||
|
}
|
||||||
|
assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn));
|
||||||
|
assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write new state to disk. If in-memory and on-disk states diverged, returns an error.
|
||||||
|
async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> {
|
||||||
|
self.tli
|
||||||
|
.map_control_file(|cf| {
|
||||||
|
if cf.partial_backup != self.state {
|
||||||
|
let memory = self.state.clone();
|
||||||
|
self.state = cf.partial_backup.clone();
|
||||||
|
anyhow::bail!(
|
||||||
|
"partial backup state diverged, memory={:?}, disk={:?}",
|
||||||
|
memory,
|
||||||
|
cf.partial_backup
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
cf.partial_backup = new_state.clone();
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
// update in-memory state
|
||||||
|
self.state = new_state;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Upload the latest version of the partial segment and garbage collect older versions.
|
||||||
|
#[instrument(name = "upload", skip_all, fields(name = %prepared.name))]
|
||||||
|
async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> {
|
||||||
|
info!("starting upload {:?}", prepared);
|
||||||
|
|
||||||
|
let state_0 = self.state.clone();
|
||||||
|
let state_1 = {
|
||||||
|
let mut state = state_0.clone();
|
||||||
|
state.segments.push(prepared.clone());
|
||||||
|
state
|
||||||
|
};
|
||||||
|
|
||||||
|
// we're going to upload a new segment, let's write it to disk to make GC later
|
||||||
|
self.commit_state(state_1).await?;
|
||||||
|
|
||||||
|
self.upload_segment(prepared.clone()).await?;
|
||||||
|
|
||||||
|
let state_2 = {
|
||||||
|
let mut state = state_0.clone();
|
||||||
|
for seg in state.segments.iter_mut() {
|
||||||
|
seg.status = UploadStatus::Deleting;
|
||||||
|
}
|
||||||
|
let mut actual_remote_segment = prepared.clone();
|
||||||
|
actual_remote_segment.status = UploadStatus::Uploaded;
|
||||||
|
state.segments.push(actual_remote_segment);
|
||||||
|
state
|
||||||
|
};
|
||||||
|
|
||||||
|
// we've uploaded new segment, it's actual, all other segments should be GCed
|
||||||
|
self.commit_state(state_2).await?;
|
||||||
|
self.gc().await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete all non-Uploaded segments from the remote storage. There should be only one
|
||||||
|
/// Uploaded segment at a time.
|
||||||
|
#[instrument(name = "gc", skip_all)]
|
||||||
|
async fn gc(&mut self) -> anyhow::Result<()> {
|
||||||
|
let mut segments_to_delete = vec![];
|
||||||
|
|
||||||
|
let new_segments: Vec<PartialRemoteSegment> = self
|
||||||
|
.state
|
||||||
|
.segments
|
||||||
|
.iter()
|
||||||
|
.filter_map(|seg| {
|
||||||
|
if seg.status == UploadStatus::Uploaded {
|
||||||
|
Some(seg.clone())
|
||||||
|
} else {
|
||||||
|
segments_to_delete.push(seg.name.clone());
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
info!("deleting objects: {:?}", segments_to_delete);
|
||||||
|
let mut objects_to_delete = vec![];
|
||||||
|
for seg in segments_to_delete.iter() {
|
||||||
|
let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?;
|
||||||
|
objects_to_delete.push(remote_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
// removing segments from remote storage
|
||||||
|
wal_backup::delete_objects(&objects_to_delete).await?;
|
||||||
|
|
||||||
|
// now we can update the state on disk
|
||||||
|
let new_state = {
|
||||||
|
let mut state = self.state.clone();
|
||||||
|
state.segments = new_segments;
|
||||||
|
state
|
||||||
|
};
|
||||||
|
self.commit_state(new_state).await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
|
||||||
|
pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||||
|
debug!("started");
|
||||||
|
let await_duration = conf.partial_backup_timeout;
|
||||||
|
|
||||||
|
let mut cancellation_rx = match tli.get_cancellation_rx() {
|
||||||
|
Ok(rx) => rx,
|
||||||
|
Err(_) => {
|
||||||
|
info!("timeline canceled during task start");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// sleep for random time to avoid thundering herd
|
||||||
|
{
|
||||||
|
let randf64 = rand::thread_rng().gen_range(0.0..1.0);
|
||||||
|
let sleep_duration = await_duration.mul_f64(randf64);
|
||||||
|
tokio::time::sleep(sleep_duration).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
let (_, persistent_state) = tli.get_state().await;
|
||||||
|
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
|
||||||
|
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
|
||||||
|
let wal_seg_size = tli.get_wal_seg_size().await;
|
||||||
|
|
||||||
|
let local_prefix = tli.timeline_dir.clone();
|
||||||
|
let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) {
|
||||||
|
Ok(path) => path.to_owned(),
|
||||||
|
Err(e) => {
|
||||||
|
error!("failed to strip workspace dir prefix: {:?}", e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut backup = PartialBackup {
|
||||||
|
wal_seg_size,
|
||||||
|
tli,
|
||||||
|
state: persistent_state.partial_backup,
|
||||||
|
conf,
|
||||||
|
local_prefix,
|
||||||
|
remote_prefix,
|
||||||
|
};
|
||||||
|
|
||||||
|
debug!("state: {:?}", backup.state);
|
||||||
|
|
||||||
|
'outer: loop {
|
||||||
|
// wait until we have something to upload
|
||||||
|
let uploaded_segment = backup.state.uploaded_segment();
|
||||||
|
if let Some(seg) = &uploaded_segment {
|
||||||
|
// if we already uploaded something, wait until we have something new
|
||||||
|
while flush_lsn_rx.borrow().lsn == seg.flush_lsn
|
||||||
|
&& *commit_lsn_rx.borrow() == seg.commit_lsn
|
||||||
|
&& flush_lsn_rx.borrow().term == seg.term
|
||||||
|
{
|
||||||
|
tokio::select! {
|
||||||
|
_ = cancellation_rx.changed() => {
|
||||||
|
info!("timeline canceled");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
_ = commit_lsn_rx.changed() => {}
|
||||||
|
_ = flush_lsn_rx.changed() => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we don't have any data and zero LSNs, wait for something
|
||||||
|
while flush_lsn_rx.borrow().lsn == Lsn(0) {
|
||||||
|
tokio::select! {
|
||||||
|
_ = cancellation_rx.changed() => {
|
||||||
|
info!("timeline canceled");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
_ = flush_lsn_rx.changed() => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// fixing the segno and waiting some time to prevent reuploading the same segment too often
|
||||||
|
let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
|
||||||
|
let timeout = tokio::time::sleep(await_duration);
|
||||||
|
tokio::pin!(timeout);
|
||||||
|
let mut timeout_expired = false;
|
||||||
|
|
||||||
|
// waiting until timeout expires OR segno changes
|
||||||
|
'inner: loop {
|
||||||
|
tokio::select! {
|
||||||
|
_ = cancellation_rx.changed() => {
|
||||||
|
info!("timeline canceled");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
_ = commit_lsn_rx.changed() => {}
|
||||||
|
_ = flush_lsn_rx.changed() => {
|
||||||
|
let segno = backup.segno(flush_lsn_rx.borrow().lsn);
|
||||||
|
if segno != pending_segno {
|
||||||
|
// previous segment is no longer partial, aborting the wait
|
||||||
|
break 'inner;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = &mut timeout => {
|
||||||
|
// timeout expired, now we are ready for upload
|
||||||
|
timeout_expired = true;
|
||||||
|
break 'inner;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !timeout_expired {
|
||||||
|
// likely segno has changed, let's try again in the next iteration
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
|
||||||
|
let prepared = backup.prepare_upload().await;
|
||||||
|
if let Some(seg) = &uploaded_segment {
|
||||||
|
if seg.eq_without_status(&prepared) {
|
||||||
|
// we already uploaded this segment, nothing to do
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match backup.do_upload(&prepared).await {
|
||||||
|
Ok(()) => {
|
||||||
|
debug!(
|
||||||
|
"uploaded {} up to flush_lsn {}",
|
||||||
|
prepared.name, prepared.flush_lsn
|
||||||
|
);
|
||||||
|
PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc();
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
info!("failed to upload {}: {:#}", prepared.name, e);
|
||||||
|
PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -176,6 +176,8 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
|
|||||||
http_auth: None,
|
http_auth: None,
|
||||||
current_thread_runtime: false,
|
current_thread_runtime: false,
|
||||||
walsenders_keep_horizon: false,
|
walsenders_keep_horizon: false,
|
||||||
|
partial_backup_enabled: false,
|
||||||
|
partial_backup_timeout: Duration::from_secs(0),
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut global = GlobalMap::new(disk, conf.clone())?;
|
let mut global = GlobalMap::new(disk, conf.clone())?;
|
||||||
|
|||||||
@@ -64,14 +64,14 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
|
|||||||
Returns basepath for files with captured output.
|
Returns basepath for files with captured output.
|
||||||
"""
|
"""
|
||||||
assert isinstance(cmd, list)
|
assert isinstance(cmd, list)
|
||||||
base = os.path.basename(cmd[0]) + "_{}".format(global_counter())
|
base = f"{os.path.basename(cmd[0])}_{global_counter()}"
|
||||||
basepath = os.path.join(capture_dir, base)
|
basepath = os.path.join(capture_dir, base)
|
||||||
stdout_filename = basepath + ".stdout"
|
stdout_filename = basepath + ".stdout"
|
||||||
stderr_filename = basepath + ".stderr"
|
stderr_filename = basepath + ".stderr"
|
||||||
|
|
||||||
with open(stdout_filename, "w") as stdout_f:
|
with open(stdout_filename, "w") as stdout_f:
|
||||||
with open(stderr_filename, "w") as stderr_f:
|
with open(stderr_filename, "w") as stderr_f:
|
||||||
print('(capturing output to "{}.stdout")'.format(base))
|
print(f'(capturing output to "{base}.stdout")')
|
||||||
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
|
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
|
||||||
|
|
||||||
return basepath
|
return basepath
|
||||||
@@ -82,11 +82,9 @@ class PgBin:
|
|||||||
|
|
||||||
def __init__(self, log_dir: Path, pg_distrib_dir, pg_version):
|
def __init__(self, log_dir: Path, pg_distrib_dir, pg_version):
|
||||||
self.log_dir = log_dir
|
self.log_dir = log_dir
|
||||||
self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin")
|
self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin")
|
||||||
self.env = os.environ.copy()
|
self.env = os.environ.copy()
|
||||||
self.env["LD_LIBRARY_PATH"] = os.path.join(
|
self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib")
|
||||||
str(pg_distrib_dir), "v{}".format(pg_version), "lib"
|
|
||||||
)
|
|
||||||
|
|
||||||
def _fixpath(self, command: List[str]):
|
def _fixpath(self, command: List[str]):
|
||||||
if "/" not in command[0]:
|
if "/" not in command[0]:
|
||||||
@@ -110,7 +108,7 @@ class PgBin:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
self._fixpath(command)
|
self._fixpath(command)
|
||||||
print('Running command "{}"'.format(" ".join(command)))
|
print(f'Running command "{" ".join(command)}"')
|
||||||
env = self._build_env(env)
|
env = self._build_env(env)
|
||||||
subprocess.run(command, env=env, cwd=cwd, check=True)
|
subprocess.run(command, env=env, cwd=cwd, check=True)
|
||||||
|
|
||||||
@@ -128,7 +126,7 @@ class PgBin:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
self._fixpath(command)
|
self._fixpath(command)
|
||||||
print('Running command "{}"'.format(" ".join(command)))
|
print(f'Running command "{" ".join(command)}"')
|
||||||
env = self._build_env(env)
|
env = self._build_env(env)
|
||||||
return subprocess_capture(
|
return subprocess_capture(
|
||||||
str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs
|
str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs
|
||||||
@@ -300,7 +298,7 @@ class NeonPageserverHttpClient(requests.Session):
|
|||||||
|
|
||||||
def lsn_to_hex(num: int) -> str:
|
def lsn_to_hex(num: int) -> str:
|
||||||
"""Convert lsn from int to standard hex notation."""
|
"""Convert lsn from int to standard hex notation."""
|
||||||
return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF)
|
return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}"
|
||||||
|
|
||||||
|
|
||||||
def lsn_from_hex(lsn_hex: str) -> int:
|
def lsn_from_hex(lsn_hex: str) -> int:
|
||||||
@@ -331,16 +329,12 @@ def wait_for_upload(
|
|||||||
if current_lsn >= lsn:
|
if current_lsn >= lsn:
|
||||||
return
|
return
|
||||||
print(
|
print(
|
||||||
"waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
|
f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}"
|
||||||
lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
raise Exception(
|
raise Exception(
|
||||||
"timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
|
f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}"
|
||||||
lsn_to_hex(lsn), lsn_to_hex(current_lsn)
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "attachment_service"
|
name = "storage_controller"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
@@ -25,6 +25,7 @@ git-version.workspace = true
|
|||||||
hex.workspace = true
|
hex.workspace = true
|
||||||
hyper.workspace = true
|
hyper.workspace = true
|
||||||
humantime.workspace = true
|
humantime.workspace = true
|
||||||
|
itertools.workspace = true
|
||||||
lasso.workspace = true
|
lasso.workspace = true
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
pageserver_api.workspace = true
|
pageserver_api.workspace = true
|
||||||
@@ -44,8 +45,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
|
|||||||
diesel_migrations = { version = "2.1.0" }
|
diesel_migrations = { version = "2.1.0" }
|
||||||
r2d2 = { version = "0.8.10" }
|
r2d2 = { version = "0.8.10" }
|
||||||
|
|
||||||
utils = { path = "../../libs/utils/" }
|
utils = { path = "../libs/utils/" }
|
||||||
metrics = { path = "../../libs/metrics/" }
|
metrics = { path = "../libs/metrics/" }
|
||||||
control_plane = { path = ".." }
|
control_plane = { path = "../control_plane" }
|
||||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||||
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user