mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-18 02:42:56 +00:00
Compare commits
1 Commits
jcsp/hack
...
proxy-cpla
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
136ed19387 |
@@ -22,7 +22,6 @@
|
||||
!s3_scrubber/
|
||||
!safekeeper/
|
||||
!storage_broker/
|
||||
!storage_controller/
|
||||
!trace/
|
||||
!vendor/postgres-*/
|
||||
!workspace_hack/
|
||||
|
||||
@@ -150,7 +150,7 @@ runs:
|
||||
|
||||
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
|
||||
# and to keep files on the host to upload them to the database
|
||||
time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"
|
||||
time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
|
||||
|
||||
# Generate redirect
|
||||
cat <<EOF > ${WORKDIR}/index.html
|
||||
|
||||
@@ -10,7 +10,7 @@ inputs:
|
||||
required: true
|
||||
api_host:
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
default: console.stage.neon.tech
|
||||
outputs:
|
||||
dsn:
|
||||
description: 'Created Branch DSN (for main database)'
|
||||
|
||||
@@ -13,7 +13,7 @@ inputs:
|
||||
required: true
|
||||
api_host:
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
default: console.stage.neon.tech
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
|
||||
@@ -13,7 +13,7 @@ inputs:
|
||||
default: 15
|
||||
api_host:
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
default: console.stage.neon.tech
|
||||
provisioner:
|
||||
desctiption: 'k8s-pod or k8s-neonvm'
|
||||
default: 'k8s-pod'
|
||||
|
||||
@@ -10,7 +10,7 @@ inputs:
|
||||
required: true
|
||||
api_host:
|
||||
desctiption: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
default: console.stage.neon.tech
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
|
||||
1
.github/workflows/approved-for-ci-run.yml
vendored
1
.github/workflows/approved-for-ci-run.yml
vendored
@@ -18,7 +18,6 @@ on:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
58
.github/workflows/benchmarking.yml
vendored
58
.github/workflows/benchmarking.yml
vendored
@@ -147,16 +147,15 @@ jobs:
|
||||
"neonvm-captest-new"
|
||||
],
|
||||
"db_size": [ "10gb" ],
|
||||
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
||||
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neonvm-captest-new", "db_size": "50gb" }]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
|
||||
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -172,7 +171,7 @@ jobs:
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
|
||||
{ "platform": "rds-aurora" }]')
|
||||
{ "platform": "rds-aurora" }]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -191,7 +190,7 @@ jobs:
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -254,9 +253,6 @@ jobs:
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
neonvm-captest-sharding-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
|
||||
;;
|
||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
@@ -274,15 +270,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Benchmark init
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -409,15 +401,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: ClickBench benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -519,15 +507,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Run TPC-H benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -613,15 +597,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Run user examples
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
|
||||
@@ -21,7 +21,6 @@ defaults:
|
||||
|
||||
concurrency:
|
||||
group: build-build-tools-image-${{ inputs.image-tag }}
|
||||
cancel-in-progress: false
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
5
.github/workflows/build_and_test.yml
vendored
5
.github/workflows/build_and_test.yml
vendored
@@ -1127,15 +1127,15 @@ jobs:
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
@@ -1144,7 +1144,6 @@ jobs:
|
||||
-f deployProxy=true \
|
||||
-f deployStorage=false \
|
||||
-f deployStorageBroker=false \
|
||||
-f deployStorageController=false \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
@@ -28,9 +28,7 @@ jobs:
|
||||
- name: Get build-tools image tag for the current commit
|
||||
id: get-build-tools-tag
|
||||
env:
|
||||
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
|
||||
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
LAST_BUILD_TOOLS_SHA=$(
|
||||
|
||||
1
.github/workflows/pin-build-tools-image.yml
vendored
1
.github/workflows/pin-build-tools-image.yml
vendored
@@ -20,7 +20,6 @@ defaults:
|
||||
|
||||
concurrency:
|
||||
group: pin-build-tools-image-${{ inputs.from-tag }}
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions: {}
|
||||
|
||||
|
||||
90
.github/workflows/trigger-e2e-tests.yml
vendored
90
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -62,14 +62,14 @@ jobs:
|
||||
|
||||
trigger-e2e-tests:
|
||||
needs: [ tag ]
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
env:
|
||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
steps:
|
||||
- name: check if ecr image are present
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
run: |
|
||||
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
|
||||
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
|
||||
@@ -79,55 +79,41 @@ jobs:
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Set e2e-platforms
|
||||
id: e2e-platforms
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
# Default set of platforms to run e2e tests on
|
||||
platforms='["docker", "k8s"]'
|
||||
|
||||
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
|
||||
# If the workflow run is not a pull request, add k8s-neonvm to the list.
|
||||
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
|
||||
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
|
||||
case "$f" in
|
||||
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
|
||||
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
||||
;;
|
||||
*)
|
||||
# no-op
|
||||
;;
|
||||
esac
|
||||
done
|
||||
else
|
||||
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
||||
fi
|
||||
|
||||
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
- name: Set PR's status to pending and request a remote CI test
|
||||
env:
|
||||
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
|
||||
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
|
||||
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
|
||||
# to place a job run status update later.
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
|
||||
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||
|
||||
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
|
||||
--method POST \
|
||||
--raw-field "state=pending" \
|
||||
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
|
||||
--raw-field "context=neon-cloud-e2e"
|
||||
REMOTE_REPO="${{ github.repository_owner }}/cloud"
|
||||
|
||||
gh workflow --repo ${REMOTE_REPO} \
|
||||
run testing.yml \
|
||||
--ref "main" \
|
||||
--raw-field "ci_job_name=neon-cloud-e2e" \
|
||||
--raw-field "commit_hash=$COMMIT_SHA" \
|
||||
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
|
||||
--raw-field "storage_image_tag=${TAG}" \
|
||||
--raw-field "compute_image_tag=${TAG}" \
|
||||
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
|
||||
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"pending\",
|
||||
\"context\": \"neon-cloud-e2e\",
|
||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||
}"
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"ref\": \"main\",
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||
\"commit_hash\": \"$COMMIT_SHA\",
|
||||
\"remote_repo\": \"${{ github.repository }}\",
|
||||
\"storage_image_tag\": \"${TAG}\",
|
||||
\"compute_image_tag\": \"${TAG}\",
|
||||
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||
/storage_controller @neondatabase/storage
|
||||
/control_plane/attachment_service @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
|
||||
512
Cargo.lock
generated
512
Cargo.lock
generated
@@ -271,10 +271,42 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atomic-take"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
|
||||
name = "attachment_service"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"aws-config",
|
||||
"bytes",
|
||||
"camino",
|
||||
"clap",
|
||||
"control_plane",
|
||||
"diesel",
|
||||
"diesel_migrations",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres_connection",
|
||||
"r2d2",
|
||||
"reqwest",
|
||||
"routerify",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
@@ -304,7 +336,7 @@ dependencies = [
|
||||
"fastrand 2.0.0",
|
||||
"hex",
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"ring 0.17.6",
|
||||
"time",
|
||||
"tokio",
|
||||
@@ -341,7 +373,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"fastrand 2.0.0",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"http-body",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"tracing",
|
||||
@@ -392,7 +424,7 @@ dependencies = [
|
||||
"aws-types",
|
||||
"bytes",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"http-body",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"regex-lite",
|
||||
@@ -520,7 +552,7 @@ dependencies = [
|
||||
"crc32fast",
|
||||
"hex",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"http-body",
|
||||
"md-5",
|
||||
"pin-project-lite",
|
||||
"sha1",
|
||||
@@ -552,7 +584,7 @@ dependencies = [
|
||||
"bytes-utils",
|
||||
"futures-core",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"http-body",
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
@@ -591,10 +623,10 @@ dependencies = [
|
||||
"aws-smithy-types",
|
||||
"bytes",
|
||||
"fastrand 2.0.0",
|
||||
"h2 0.3.26",
|
||||
"h2",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"http-body",
|
||||
"hyper",
|
||||
"hyper-rustls",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
@@ -632,7 +664,7 @@ dependencies = [
|
||||
"bytes-utils",
|
||||
"futures-core",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"http-body",
|
||||
"itoa",
|
||||
"num-integer",
|
||||
"pin-project-lite",
|
||||
@@ -681,8 +713,8 @@ dependencies = [
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"http-body",
|
||||
"hyper",
|
||||
"itoa",
|
||||
"matchit",
|
||||
"memchr",
|
||||
@@ -697,7 +729,7 @@ dependencies = [
|
||||
"sha1",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.20.0",
|
||||
"tokio-tungstenite",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
@@ -713,7 +745,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"http-body",
|
||||
"mime",
|
||||
"rustversion",
|
||||
"tower-layer",
|
||||
@@ -1130,7 +1162,7 @@ version = "4.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b"
|
||||
dependencies = [
|
||||
"heck 0.4.1",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
@@ -1202,7 +1234,7 @@ dependencies = [
|
||||
"compute_api",
|
||||
"flate2",
|
||||
"futures",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"nix 0.27.1",
|
||||
"notify",
|
||||
"num_cpus",
|
||||
@@ -1319,7 +1351,7 @@ dependencies = [
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
@@ -1468,9 +1500,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.19"
|
||||
version = "0.8.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
|
||||
checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossterm"
|
||||
@@ -1843,12 +1878,23 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.3.8"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245"
|
||||
checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
|
||||
dependencies = [
|
||||
"errno-dragonfly",
|
||||
"libc",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno-dragonfly"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2188,9 +2234,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.3.26"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
|
||||
checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
@@ -2205,25 +2251,6 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"indexmap 2.0.1",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "1.8.2"
|
||||
@@ -2305,12 +2332,6 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.3.3"
|
||||
@@ -2395,29 +2416,6 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-body"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-body-util"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-types"
|
||||
version = "2.12.0"
|
||||
@@ -2476,9 +2474,9 @@ dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"h2 0.3.26",
|
||||
"h2",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"http-body",
|
||||
"httparse",
|
||||
"httpdate",
|
||||
"itoa",
|
||||
@@ -2490,26 +2488,6 @@ dependencies = [
|
||||
"want",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"h2 0.4.4",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"httparse",
|
||||
"httpdate",
|
||||
"itoa",
|
||||
"pin-project-lite",
|
||||
"smallvec",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-rustls"
|
||||
version = "0.24.0"
|
||||
@@ -2517,7 +2495,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7"
|
||||
dependencies = [
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"log",
|
||||
"rustls 0.21.9",
|
||||
"rustls-native-certs 0.6.2",
|
||||
@@ -2531,7 +2509,7 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
|
||||
dependencies = [
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
@@ -2544,7 +2522,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"native-tls",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
@@ -2552,33 +2530,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tungstenite"
|
||||
version = "0.13.0"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
|
||||
checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
|
||||
dependencies = [
|
||||
"http-body-util",
|
||||
"hyper 1.2.0",
|
||||
"hyper-util",
|
||||
"hyper",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.21.0",
|
||||
"tungstenite 0.21.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"hyper 1.2.0",
|
||||
"pin-project-lite",
|
||||
"socket2 0.5.5",
|
||||
"tokio",
|
||||
"tokio-tungstenite",
|
||||
"tungstenite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2872,12 +2832,6 @@ version = "0.3.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.10"
|
||||
@@ -2932,12 +2886,11 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "measured"
|
||||
version = "0.0.21"
|
||||
version = "0.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
|
||||
checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"crossbeam-utils",
|
||||
"hashbrown 0.14.0",
|
||||
"itoa",
|
||||
"lasso",
|
||||
@@ -2950,27 +2903,16 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "measured-derive"
|
||||
version = "0.0.21"
|
||||
version = "0.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
|
||||
checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80"
|
||||
dependencies = [
|
||||
"heck 0.5.0",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "measured-process"
|
||||
version = "0.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"measured",
|
||||
"procfs 0.16.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.6.4"
|
||||
@@ -3010,10 +2952,8 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"libc",
|
||||
"measured",
|
||||
"measured-process",
|
||||
"once_cell",
|
||||
"procfs 0.14.2",
|
||||
"procfs",
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
@@ -3495,9 +3435,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ordered-multimap"
|
||||
version = "0.7.3"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
|
||||
checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
|
||||
dependencies = [
|
||||
"dlv-list",
|
||||
"hashbrown 0.14.0",
|
||||
@@ -3563,17 +3503,12 @@ dependencies = [
|
||||
"camino",
|
||||
"clap",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"svg_fmt",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -3609,7 +3544,7 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"leaky-bucket",
|
||||
"md5",
|
||||
@@ -3628,7 +3563,7 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"procfs 0.14.2",
|
||||
"procfs",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
@@ -3719,6 +3654,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
@@ -4188,29 +4124,6 @@ dependencies = [
|
||||
"rustix 0.36.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "procfs"
|
||||
version = "0.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"procfs-core",
|
||||
"rustix 0.38.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "procfs-core"
|
||||
version = "0.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"hex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prometheus"
|
||||
version = "0.13.3"
|
||||
@@ -4223,7 +4136,7 @@ dependencies = [
|
||||
"libc",
|
||||
"memchr",
|
||||
"parking_lot 0.12.1",
|
||||
"procfs 0.14.2",
|
||||
"procfs",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
@@ -4244,7 +4157,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck 0.4.1",
|
||||
"heck",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"log",
|
||||
@@ -4286,9 +4199,7 @@ name = "proxy"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"atomic-take",
|
||||
"aws-config",
|
||||
"aws-sdk-iam",
|
||||
"aws-sigv4",
|
||||
@@ -4312,17 +4223,13 @@ dependencies = [
|
||||
"hmac",
|
||||
"hostname",
|
||||
"http 1.1.0",
|
||||
"http-body-util",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper 1.2.0",
|
||||
"hyper",
|
||||
"hyper-tungstenite",
|
||||
"hyper-util",
|
||||
"ipnet",
|
||||
"itertools",
|
||||
"lasso",
|
||||
"md5",
|
||||
"measured",
|
||||
"metrics",
|
||||
"native-tls",
|
||||
"once_cell",
|
||||
@@ -4651,7 +4558,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"http-types",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
@@ -4681,10 +4588,10 @@ dependencies = [
|
||||
"encoding_rs",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"h2 0.3.26",
|
||||
"h2",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"http-body",
|
||||
"hyper",
|
||||
"hyper-rustls",
|
||||
"hyper-tls",
|
||||
"ipnet",
|
||||
@@ -4742,7 +4649,7 @@ dependencies = [
|
||||
"futures",
|
||||
"getrandom 0.2.11",
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"parking_lot 0.11.2",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -4829,7 +4736,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945"
|
||||
dependencies = [
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"lazy_static",
|
||||
"percent-encoding",
|
||||
"regex",
|
||||
@@ -4941,19 +4848,6 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys 0.4.13",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.21.9"
|
||||
@@ -5084,7 +4978,6 @@ dependencies = [
|
||||
"aws-smithy-async",
|
||||
"bincode",
|
||||
"bytes",
|
||||
"camino",
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
@@ -5135,7 +5028,7 @@ dependencies = [
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
@@ -5620,9 +5513,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.13.1"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"
|
||||
checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
|
||||
|
||||
[[package]]
|
||||
name = "smol_str"
|
||||
@@ -5714,7 +5607,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
@@ -5728,65 +5621,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storage_controller"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"aws-config",
|
||||
"bytes",
|
||||
"camino",
|
||||
"clap",
|
||||
"control_plane",
|
||||
"diesel",
|
||||
"diesel_migrations",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"itertools",
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres_connection",
|
||||
"r2d2",
|
||||
"reqwest",
|
||||
"routerify",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storcon_cli"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"hyper 0.14.26",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stringprep"
|
||||
version = "0.1.2"
|
||||
@@ -5815,7 +5649,7 @@ version = "0.24.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
|
||||
dependencies = [
|
||||
"heck 0.4.1",
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
@@ -5943,23 +5777,23 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "test-context"
|
||||
version = "0.3.0"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9"
|
||||
checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures",
|
||||
"test-context-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "test-context-macros"
|
||||
version = "0.3.0"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1"
|
||||
checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6100,9 +5934,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.37.0"
|
||||
version = "1.36.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
|
||||
checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"bytes",
|
||||
@@ -6257,19 +6091,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.20.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tungstenite"
|
||||
version = "0.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.21.0",
|
||||
"tungstenite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6336,10 +6158,10 @@ dependencies = [
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"h2 0.3.26",
|
||||
"h2",
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"http-body",
|
||||
"hyper",
|
||||
"hyper-timeout",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
@@ -6525,7 +6347,7 @@ dependencies = [
|
||||
name = "tracing-utils"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"opentelemetry",
|
||||
"opentelemetry-otlp",
|
||||
"opentelemetry-semantic-conventions",
|
||||
@@ -6562,25 +6384,6 @@ dependencies = [
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tungstenite"
|
||||
version = "0.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"data-encoding",
|
||||
"http 1.1.0",
|
||||
"httparse",
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"url",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "1.6.3"
|
||||
@@ -6745,8 +6548,7 @@ dependencies = [
|
||||
"heapless",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"jsonwebtoken",
|
||||
"leaky-bucket",
|
||||
"metrics",
|
||||
@@ -7106,15 +6908,6 @@ dependencies = [
|
||||
"windows-targets 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
||||
dependencies = [
|
||||
"windows-targets 0.52.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.42.2"
|
||||
@@ -7145,21 +6938,6 @@ dependencies = [
|
||||
"windows_x86_64_msvc 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm 0.52.4",
|
||||
"windows_aarch64_msvc 0.52.4",
|
||||
"windows_i686_gnu 0.52.4",
|
||||
"windows_i686_msvc 0.52.4",
|
||||
"windows_x86_64_gnu 0.52.4",
|
||||
"windows_x86_64_gnullvm 0.52.4",
|
||||
"windows_x86_64_msvc 0.52.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.42.2"
|
||||
@@ -7172,12 +6950,6 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.42.2"
|
||||
@@ -7190,12 +6962,6 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.42.2"
|
||||
@@ -7208,12 +6974,6 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.42.2"
|
||||
@@ -7226,12 +6986,6 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.42.2"
|
||||
@@ -7244,12 +6998,6 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.42.2"
|
||||
@@ -7262,12 +7010,6 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.42.2"
|
||||
@@ -7280,12 +7022,6 @@ version = "0.48.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.4.6"
|
||||
@@ -7334,10 +7070,11 @@ dependencies = [
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"getrandom 0.2.11",
|
||||
"hashbrown 0.13.2",
|
||||
"hashbrown 0.14.0",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper 0.14.26",
|
||||
"hyper",
|
||||
"indexmap 1.9.3",
|
||||
"itertools",
|
||||
"libc",
|
||||
@@ -7375,6 +7112,7 @@ dependencies = [
|
||||
"tower",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tungstenite",
|
||||
"url",
|
||||
"uuid",
|
||||
"zeroize",
|
||||
|
||||
11
Cargo.toml
11
Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
|
||||
members = [
|
||||
"compute_tools",
|
||||
"control_plane",
|
||||
"control_plane/storcon_cli",
|
||||
"control_plane/attachment_service",
|
||||
"pageserver",
|
||||
"pageserver/compaction",
|
||||
"pageserver/ctl",
|
||||
@@ -12,7 +12,6 @@ members = [
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"storage_broker",
|
||||
"storage_controller",
|
||||
"s3_scrubber",
|
||||
"workspace_hack",
|
||||
"trace",
|
||||
@@ -44,7 +43,6 @@ license = "Apache-2.0"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||
atomic-take = "1.1.0"
|
||||
azure_core = "0.18"
|
||||
azure_identity = "0.18"
|
||||
azure_storage = "0.18"
|
||||
@@ -98,7 +96,7 @@ http-types = { version = "2", default-features = false }
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
hyper-tungstenite = "0.13.0"
|
||||
hyper-tungstenite = "0.11"
|
||||
inotify = "0.10.2"
|
||||
ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
@@ -107,8 +105,7 @@ lasso = "0.7"
|
||||
leaky-bucket = "1.0.1"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
measured = { version = "0.0.21", features=["lasso"] }
|
||||
measured-process = { version = "0.0.21" }
|
||||
measured = { version = "0.0.13", features=["default", "lasso"] }
|
||||
memoffset = "0.8"
|
||||
native-tls = "0.2"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
@@ -161,7 +158,7 @@ svg_fmt = "0.4.1"
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
task-local-extensions = "0.1.4"
|
||||
test-context = "0.3"
|
||||
test-context = "0.1"
|
||||
thiserror = "1.0"
|
||||
tikv-jemallocator = "0.5"
|
||||
tikv-jemalloc-ctl = "0.5"
|
||||
|
||||
@@ -58,12 +58,6 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
|
||||
&& mv protoc/include/google /usr/local/include/google \
|
||||
&& rm -rf protoc.zip protoc
|
||||
|
||||
# s5cmd
|
||||
ENV S5CMD_VERSION=2.2.2
|
||||
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
|
||||
&& chmod +x s5cmd \
|
||||
&& mv s5cmd /usr/local/bin/s5cmd
|
||||
|
||||
# LLVM
|
||||
ENV LLVM_VERSION=17
|
||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
|
||||
@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
# Create remote extension download directory
|
||||
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
|
||||
|
||||
# Install:
|
||||
# libreadline8 for psql
|
||||
# libicu67, locales for collations (including ICU and plpgsql_check)
|
||||
|
||||
@@ -818,15 +818,9 @@ impl ComputeNode {
|
||||
Client::connect(zenith_admin_connstr.as_str(), NoTls)
|
||||
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
|
||||
// Disable forwarding so that users don't get a cloud_admin role
|
||||
|
||||
let mut func = || {
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
func().context("apply_config setup cloud_admin")?;
|
||||
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
drop(client);
|
||||
|
||||
// reconnect with connstring with expected name
|
||||
@@ -838,29 +832,24 @@ impl ComputeNode {
|
||||
};
|
||||
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
client
|
||||
.simple_query("SET neon.forward_ddl = false")
|
||||
.context("apply_config SET neon.forward_ddl = false")?;
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
|
||||
create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
|
||||
cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
|
||||
handle_roles(spec, &mut client).context("apply_config handle_roles")?;
|
||||
handle_databases(spec, &mut client).context("apply_config handle_databases")?;
|
||||
handle_role_deletions(spec, connstr.as_str(), &mut client)
|
||||
.context("apply_config handle_role_deletions")?;
|
||||
create_neon_superuser(spec, &mut client)?;
|
||||
cleanup_instance(&mut client)?;
|
||||
handle_roles(spec, &mut client)?;
|
||||
handle_databases(spec, &mut client)?;
|
||||
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
|
||||
handle_grants(
|
||||
spec,
|
||||
&mut client,
|
||||
connstr.as_str(),
|
||||
self.has_feature(ComputeFeature::AnonExtension),
|
||||
)
|
||||
.context("apply_config handle_grants")?;
|
||||
handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
|
||||
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
|
||||
create_availability_check_data(&mut client)
|
||||
.context("apply_config create_availability_check_data")?;
|
||||
)?;
|
||||
handle_extensions(spec, &mut client)?;
|
||||
handle_extension_neon(&mut client)?;
|
||||
create_availability_check_data(&mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
@@ -868,7 +857,7 @@ impl ComputeNode {
|
||||
// Run migrations separately to not hold up cold starts
|
||||
thread::spawn(move || {
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
handle_migrations(&mut client).context("apply_config handle_migrations")
|
||||
handle_migrations(&mut client)
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
@@ -1273,12 +1262,10 @@ LIMIT 100",
|
||||
.await
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
if download_size.is_ok() {
|
||||
self.ext_download_progress
|
||||
.write()
|
||||
.expect("bad lock")
|
||||
.insert(ext_archive_name.to_string(), (download_start, true));
|
||||
}
|
||||
self.ext_download_progress
|
||||
.write()
|
||||
.expect("bad lock")
|
||||
.insert(ext_archive_name.to_string(), (download_start, true));
|
||||
|
||||
download_size
|
||||
}
|
||||
|
||||
@@ -6,8 +6,8 @@ use std::path::Path;
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::pg_helpers::escape_conf_value;
|
||||
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
|
||||
use crate::pg_helpers::PgOptionsSerialize;
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
|
||||
/// Check that `line` is inside a text file and put it there if it is not.
|
||||
/// Create file if it doesn't exist.
|
||||
@@ -92,27 +92,6 @@ pub fn write_postgres_conf(
|
||||
}
|
||||
}
|
||||
|
||||
if cfg!(target_os = "linux") {
|
||||
// Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
|
||||
// disabled), then the control plane has enabled swap and we should set
|
||||
// dynamic_shared_memory_type = 'mmap'.
|
||||
//
|
||||
// This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
|
||||
let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
|
||||
// ignore any errors - they may be expected to occur under certain situations (e.g. when
|
||||
// not running in Linux).
|
||||
.unwrap_or_else(|_| String::new());
|
||||
if overcommit_memory_contents.trim() == "2" {
|
||||
let opt = GenericOption {
|
||||
name: "dynamic_shared_memory_type".to_owned(),
|
||||
value: Some("mmap".to_owned()),
|
||||
vartype: "enum".to_owned(),
|
||||
};
|
||||
|
||||
write!(file, "{}", opt.to_pg_setting())?;
|
||||
}
|
||||
}
|
||||
|
||||
// If there are any extra options in the 'settings' field, append those
|
||||
if spec.cluster.settings.is_some() {
|
||||
writeln!(file, "# Managed by compute_ctl: begin")?;
|
||||
|
||||
@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
|
||||
format!("'{}'", res)
|
||||
}
|
||||
|
||||
pub trait GenericOptionExt {
|
||||
trait GenericOptionExt {
|
||||
fn to_pg_option(&self) -> String;
|
||||
fn to_pg_setting(&self) -> String;
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use postgres::config::Config;
|
||||
use postgres::{Client, NoTls};
|
||||
use reqwest::StatusCode;
|
||||
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
RoleAction::Create => {
|
||||
// This branch only runs when roles are created through the console, so it is
|
||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||
// from neon_superuser.
|
||||
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("running role create query: '{}'", &query);
|
||||
@@ -698,8 +698,7 @@ pub fn handle_grants(
|
||||
|
||||
// it is important to run this after all grants
|
||||
if enable_anon_extension {
|
||||
handle_extension_anon(spec, &db.owner, &mut db_client, false)
|
||||
.context("handle_grants handle_extension_anon")?;
|
||||
handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -744,24 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
|
||||
// which may happen in two cases:
|
||||
// - extension was just installed
|
||||
// - extension was already installed and is up to date
|
||||
let query = "ALTER EXTENSION neon UPDATE";
|
||||
info!("update neon extension version with query: {}", query);
|
||||
if let Err(e) = client.simple_query(query) {
|
||||
error!(
|
||||
"failed to upgrade neon extension during `handle_extension_neon`: {}",
|
||||
e
|
||||
);
|
||||
}
|
||||
// DISABLED due to compute node unpinning epic
|
||||
// let query = "ALTER EXTENSION neon UPDATE";
|
||||
// info!("update neon extension version with query: {}", query);
|
||||
// client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
|
||||
info!("handle neon extension upgrade");
|
||||
let query = "ALTER EXTENSION neon UPDATE";
|
||||
info!("update neon extension version with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
|
||||
info!("handle neon extension upgrade (not really)");
|
||||
// DISABLED due to compute node unpinning epic
|
||||
// let query = "ALTER EXTENSION neon UPDATE";
|
||||
// info!("update neon extension version with query: {}", query);
|
||||
// client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -810,40 +806,43 @@ $$;"#,
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name TEXT;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
||||
END LOOP;
|
||||
END
|
||||
$$;"#,
|
||||
];
|
||||
|
||||
let mut func = || {
|
||||
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
client.simple_query(query)?;
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
client.simple_query(query)?;
|
||||
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
client.simple_query(query)?;
|
||||
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
client.simple_query(query)?;
|
||||
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
client.simple_query(query)?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
func().context("handle_migrations prepare")?;
|
||||
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = client
|
||||
.query_one(query, &[])
|
||||
.context("handle_migrations get migration_id")?;
|
||||
query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = client.query_one(query, &[])?;
|
||||
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
|
||||
let starting_migration_id = current_migration;
|
||||
|
||||
let query = "BEGIN";
|
||||
client
|
||||
.simple_query(query)
|
||||
.context("handle_migrations begin")?;
|
||||
query = "BEGIN";
|
||||
client.simple_query(query)?;
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
let migration = &migrations[current_migration];
|
||||
@@ -851,9 +850,7 @@ $$;"#,
|
||||
info!("Skip migration id={}", current_migration);
|
||||
} else {
|
||||
info!("Running migration:\n{}\n", migration);
|
||||
client.simple_query(migration).with_context(|| {
|
||||
format!("handle_migrations current_migration={}", current_migration)
|
||||
})?;
|
||||
client.simple_query(migration)?;
|
||||
}
|
||||
current_migration += 1;
|
||||
}
|
||||
@@ -861,14 +858,10 @@ $$;"#,
|
||||
"UPDATE neon_migration.migration_id SET id={}",
|
||||
migrations.len()
|
||||
);
|
||||
client
|
||||
.simple_query(&setval)
|
||||
.context("handle_migrations update id")?;
|
||||
client.simple_query(&setval)?;
|
||||
|
||||
let query = "COMMIT";
|
||||
client
|
||||
.simple_query(query)
|
||||
.context("handle_migrations commit")?;
|
||||
query = "COMMIT";
|
||||
client.simple_query(query)?;
|
||||
|
||||
info!(
|
||||
"Ran {} migrations",
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[package]
|
||||
name = "storage_controller"
|
||||
name = "attachment_service"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
@@ -25,7 +25,6 @@ git-version.workspace = true
|
||||
hex.workspace = true
|
||||
hyper.workspace = true
|
||||
humantime.workspace = true
|
||||
itertools.workspace = true
|
||||
lasso.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
@@ -45,8 +44,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
|
||||
diesel_migrations = { version = "2.1.0" }
|
||||
r2d2 = { version = "0.8.10" }
|
||||
|
||||
utils = { path = "../libs/utils/" }
|
||||
metrics = { path = "../libs/metrics/" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
utils = { path = "../../libs/utils/" }
|
||||
metrics = { path = "../../libs/metrics/" }
|
||||
control_plane = { path = ".." }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use std::sync::Arc;
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
|
||||
@@ -15,32 +14,19 @@ use utils::{
|
||||
|
||||
use crate::service::Config;
|
||||
|
||||
const BUSY_DELAY: Duration = Duration::from_secs(1);
|
||||
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
|
||||
|
||||
const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
pub(crate) const API_CONCURRENCY: usize = 32;
|
||||
|
||||
struct UnshardedComputeHookTenant {
|
||||
// Which node is this tenant attached to
|
||||
node_id: NodeId,
|
||||
|
||||
// Must hold this lock to send a notification.
|
||||
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
|
||||
}
|
||||
struct ShardedComputeHookTenant {
|
||||
stripe_size: ShardStripeSize,
|
||||
shard_count: ShardCount,
|
||||
shards: Vec<(ShardNumber, NodeId)>,
|
||||
|
||||
// Must hold this lock to send a notification. The contents represent
|
||||
// the last successfully sent notification, and are used to coalesce multiple
|
||||
// updates by only sending when there is a chance since our last successful send.
|
||||
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
|
||||
}
|
||||
|
||||
enum ComputeHookTenant {
|
||||
Unsharded(UnshardedComputeHookTenant),
|
||||
Unsharded(NodeId),
|
||||
Sharded(ShardedComputeHookTenant),
|
||||
}
|
||||
|
||||
@@ -52,20 +38,9 @@ impl ComputeHookTenant {
|
||||
shards: vec![(tenant_shard_id.shard_number, node_id)],
|
||||
stripe_size,
|
||||
shard_count: tenant_shard_id.shard_count,
|
||||
send_lock: Arc::default(),
|
||||
})
|
||||
} else {
|
||||
Self::Unsharded(UnshardedComputeHookTenant {
|
||||
node_id,
|
||||
send_lock: Arc::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
|
||||
match self {
|
||||
Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
|
||||
Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
|
||||
Self::Unsharded(node_id)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,8 +53,8 @@ impl ComputeHookTenant {
|
||||
node_id: NodeId,
|
||||
) {
|
||||
match self {
|
||||
Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
|
||||
unsharded_tenant.node_id = node_id
|
||||
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
|
||||
*existing_node_id = node_id
|
||||
}
|
||||
Self::Sharded(sharded_tenant)
|
||||
if sharded_tenant.stripe_size == stripe_size
|
||||
@@ -106,14 +81,14 @@ impl ComputeHookTenant {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct ComputeHookNotifyRequestShard {
|
||||
node_id: NodeId,
|
||||
shard_number: ShardNumber,
|
||||
}
|
||||
|
||||
/// Request body that we send to the control plane to notify it of where a tenant is attached
|
||||
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct ComputeHookNotifyRequest {
|
||||
tenant_id: TenantId,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
@@ -146,44 +121,14 @@ pub(crate) enum NotifyError {
|
||||
Fatal(StatusCode),
|
||||
}
|
||||
|
||||
enum MaybeSendResult {
|
||||
// Please send this request while holding the lock, and if you succeed then write
|
||||
// the request into the lock.
|
||||
Transmit(
|
||||
(
|
||||
ComputeHookNotifyRequest,
|
||||
tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
|
||||
),
|
||||
),
|
||||
// Something requires sending, but you must wait for a current sender then call again
|
||||
AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
|
||||
// Nothing requires sending
|
||||
Noop,
|
||||
}
|
||||
|
||||
impl ComputeHookTenant {
|
||||
fn maybe_send(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
|
||||
) -> MaybeSendResult {
|
||||
let locked = match lock {
|
||||
Some(already_locked) => already_locked,
|
||||
None => {
|
||||
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
|
||||
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
|
||||
return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
|
||||
};
|
||||
locked
|
||||
}
|
||||
};
|
||||
|
||||
let request = match self {
|
||||
Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
|
||||
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
|
||||
match self {
|
||||
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
|
||||
tenant_id,
|
||||
shards: vec![ComputeHookNotifyRequestShard {
|
||||
shard_number: ShardNumber(0),
|
||||
node_id: unsharded_tenant.node_id,
|
||||
node_id: *node_id,
|
||||
}],
|
||||
stripe_size: None,
|
||||
}),
|
||||
@@ -207,25 +152,12 @@ impl ComputeHookTenant {
|
||||
// Sharded tenant doesn't yet have information for all its shards
|
||||
|
||||
tracing::info!(
|
||||
"ComputeHookTenant::maybe_send: not enough shards ({}/{})",
|
||||
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
|
||||
sharded_tenant.shards.len(),
|
||||
sharded_tenant.shard_count.count()
|
||||
);
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
match request {
|
||||
None => {
|
||||
// Not yet ready to emit a notification
|
||||
tracing::info!("Tenant isn't yet ready to emit a notification");
|
||||
MaybeSendResult::Noop
|
||||
}
|
||||
Some(request) if Some(&request) == locked.as_ref() => {
|
||||
// No change from the last value successfully sent
|
||||
MaybeSendResult::Noop
|
||||
}
|
||||
Some(request) => MaybeSendResult::Transmit((request, locked)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -235,19 +167,8 @@ impl ComputeHookTenant {
|
||||
/// the compute connection string.
|
||||
pub(super) struct ComputeHook {
|
||||
config: Config,
|
||||
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
|
||||
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
|
||||
authorization_header: Option<String>,
|
||||
|
||||
// Concurrency limiter, so that we do not overload the cloud control plane when updating
|
||||
// large numbers of tenants (e.g. when failing over after a node failure)
|
||||
api_concurrency: tokio::sync::Semaphore,
|
||||
|
||||
// This lock is only used in testing enviroments, to serialize calls into neon_lock
|
||||
neon_local_lock: tokio::sync::Mutex<()>,
|
||||
|
||||
// We share a client across all notifications to enable connection re-use etc when
|
||||
// sending large numbers of notifications
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
impl ComputeHook {
|
||||
@@ -257,30 +178,18 @@ impl ComputeHook {
|
||||
.clone()
|
||||
.map(|jwt| format!("Bearer {}", jwt));
|
||||
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.timeout(NOTIFY_REQUEST_TIMEOUT)
|
||||
.build()
|
||||
.expect("Failed to construct HTTP client");
|
||||
|
||||
Self {
|
||||
state: Default::default(),
|
||||
config,
|
||||
authorization_header,
|
||||
neon_local_lock: Default::default(),
|
||||
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
|
||||
client,
|
||||
}
|
||||
}
|
||||
|
||||
/// For test environments: use neon_local's LocalEnv to update compute
|
||||
async fn do_notify_local(
|
||||
&self,
|
||||
reconfigure_request: &ComputeHookNotifyRequest,
|
||||
reconfigure_request: ComputeHookNotifyRequest,
|
||||
) -> anyhow::Result<()> {
|
||||
// neon_local updates are not safe to call concurrently, use a lock to serialize
|
||||
// all calls to this function
|
||||
let _locked = self.neon_local_lock.lock().await;
|
||||
|
||||
let env = match LocalEnv::load_config() {
|
||||
Ok(e) => e,
|
||||
Err(e) => {
|
||||
@@ -297,7 +206,7 @@ impl ComputeHook {
|
||||
} = reconfigure_request;
|
||||
|
||||
let compute_pageservers = shards
|
||||
.iter()
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
let ps_conf = env
|
||||
.get_pageserver_conf(shard.node_id)
|
||||
@@ -309,10 +218,10 @@ impl ComputeHook {
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for (endpoint_name, endpoint) in &cplane.endpoints {
|
||||
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
|
||||
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
|
||||
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
|
||||
endpoint
|
||||
.reconfigure(compute_pageservers.clone(), *stripe_size)
|
||||
.reconfigure(compute_pageservers.clone(), stripe_size)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
@@ -322,11 +231,12 @@ impl ComputeHook {
|
||||
|
||||
async fn do_notify_iteration(
|
||||
&self,
|
||||
client: &reqwest::Client,
|
||||
url: &String,
|
||||
reconfigure_request: &ComputeHookNotifyRequest,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), NotifyError> {
|
||||
let req = self.client.request(Method::PUT, url);
|
||||
let req = client.request(Method::PUT, url);
|
||||
let req = if let Some(value) = &self.authorization_header {
|
||||
req.header(reqwest::header::AUTHORIZATION, value)
|
||||
} else {
|
||||
@@ -370,10 +280,11 @@ impl ComputeHook {
|
||||
Err(NotifyError::SlowDown)
|
||||
}
|
||||
StatusCode::LOCKED => {
|
||||
// We consider this fatal, because it's possible that the operation blocking the control one is
|
||||
// also the one that is waiting for this reconcile. We should let the reconciler calling
|
||||
// this hook fail, to give control plane a chance to un-lock.
|
||||
tracing::info!("Control plane reports tenant is locked, dropping out of notify");
|
||||
// Delay our retry if busy: the usual fast exponential backoff in backoff::retry
|
||||
// is not appropriate
|
||||
tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
|
||||
.await
|
||||
.ok();
|
||||
Err(NotifyError::Busy)
|
||||
}
|
||||
StatusCode::SERVICE_UNAVAILABLE
|
||||
@@ -389,27 +300,13 @@ impl ComputeHook {
|
||||
async fn do_notify(
|
||||
&self,
|
||||
url: &String,
|
||||
reconfigure_request: &ComputeHookNotifyRequest,
|
||||
reconfigure_request: ComputeHookNotifyRequest,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), NotifyError> {
|
||||
// We hold these semaphore units across all retries, rather than only across each
|
||||
// HTTP request: this is to preserve fairness and avoid a situation where a retry might
|
||||
// time out waiting for a semaphore.
|
||||
let _units = self
|
||||
.api_concurrency
|
||||
.acquire()
|
||||
.await
|
||||
// Interpret closed semaphore as shutdown
|
||||
.map_err(|_| NotifyError::ShuttingDown)?;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
backoff::retry(
|
||||
|| self.do_notify_iteration(url, reconfigure_request, cancel),
|
||||
|e| {
|
||||
matches!(
|
||||
e,
|
||||
NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
|
||||
)
|
||||
},
|
||||
|| self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|
||||
|e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
|
||||
3,
|
||||
10,
|
||||
"Send compute notification",
|
||||
@@ -443,70 +340,42 @@ impl ComputeHook {
|
||||
stripe_size: ShardStripeSize,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), NotifyError> {
|
||||
let maybe_send_result = {
|
||||
let mut state_locked = self.state.lock().unwrap();
|
||||
let mut locked = self.state.lock().await;
|
||||
|
||||
use std::collections::hash_map::Entry;
|
||||
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
|
||||
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
|
||||
tenant_shard_id,
|
||||
stripe_size,
|
||||
node_id,
|
||||
)),
|
||||
Entry::Occupied(e) => {
|
||||
let tenant = e.into_mut();
|
||||
tenant.update(tenant_shard_id, stripe_size, node_id);
|
||||
tenant
|
||||
}
|
||||
};
|
||||
tenant.maybe_send(tenant_shard_id.tenant_id, None)
|
||||
use std::collections::hash_map::Entry;
|
||||
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
|
||||
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
|
||||
tenant_shard_id,
|
||||
stripe_size,
|
||||
node_id,
|
||||
)),
|
||||
Entry::Occupied(e) => {
|
||||
let tenant = e.into_mut();
|
||||
tenant.update(tenant_shard_id, stripe_size, node_id);
|
||||
tenant
|
||||
}
|
||||
};
|
||||
|
||||
// Process result: we may get an update to send, or we may have to wait for a lock
|
||||
// before trying again.
|
||||
let (request, mut send_lock_guard) = match maybe_send_result {
|
||||
MaybeSendResult::Noop => {
|
||||
return Ok(());
|
||||
}
|
||||
MaybeSendResult::AwaitLock(send_lock) => {
|
||||
let send_locked = send_lock.lock_owned().await;
|
||||
|
||||
// Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
|
||||
// we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses
|
||||
// try_lock.
|
||||
let state_locked = self.state.lock().unwrap();
|
||||
let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
|
||||
return Ok(());
|
||||
};
|
||||
match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
|
||||
MaybeSendResult::AwaitLock(_) => {
|
||||
unreachable!("We supplied lock guard")
|
||||
}
|
||||
MaybeSendResult::Noop => {
|
||||
return Ok(());
|
||||
}
|
||||
MaybeSendResult::Transmit((request, lock)) => (request, lock),
|
||||
}
|
||||
}
|
||||
MaybeSendResult::Transmit((request, lock)) => (request, lock),
|
||||
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
|
||||
let Some(reconfigure_request) = reconfigure_request else {
|
||||
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
|
||||
// until it does.
|
||||
tracing::info!("Tenant isn't yet ready to emit a notification");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let result = if let Some(notify_url) = &self.config.compute_hook_url {
|
||||
self.do_notify(notify_url, &request, cancel).await
|
||||
if let Some(notify_url) = &self.config.compute_hook_url {
|
||||
self.do_notify(notify_url, reconfigure_request, cancel)
|
||||
.await
|
||||
} else {
|
||||
self.do_notify_local(&request).await.map_err(|e| {
|
||||
// This path is for testing only, so munge the error into our prod-style error type.
|
||||
tracing::error!("Local notification hook failed: {e}");
|
||||
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
|
||||
})
|
||||
};
|
||||
|
||||
if result.is_ok() {
|
||||
// Before dropping the send lock, stash the request we just sent so that
|
||||
// subsequent callers can avoid redundantly re-sending the same thing.
|
||||
*send_lock_guard = Some(request);
|
||||
self.do_notify_local(reconfigure_request)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
// This path is for testing only, so munge the error into our prod-style error type.
|
||||
tracing::error!("Local notification hook failed: {e}");
|
||||
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
|
||||
})
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
@@ -530,22 +399,21 @@ pub(crate) mod tests {
|
||||
NodeId(1),
|
||||
);
|
||||
|
||||
// An unsharded tenant is always ready to emit a notification, but won't
|
||||
// send the same one twice
|
||||
let send_result = tenant_state.maybe_send(tenant_id, None);
|
||||
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
|
||||
anyhow::bail!("Wrong send result");
|
||||
};
|
||||
assert_eq!(request.shards.len(), 1);
|
||||
assert!(request.stripe_size.is_none());
|
||||
|
||||
// Simulate successful send
|
||||
*guard = Some(request);
|
||||
drop(guard);
|
||||
|
||||
// Try asking again: this should be a no-op
|
||||
let send_result = tenant_state.maybe_send(tenant_id, None);
|
||||
assert!(matches!(send_result, MaybeSendResult::Noop));
|
||||
// An unsharded tenant is always ready to emit a notification
|
||||
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
|
||||
assert_eq!(
|
||||
tenant_state
|
||||
.maybe_reconfigure(tenant_id)
|
||||
.unwrap()
|
||||
.shards
|
||||
.len(),
|
||||
1
|
||||
);
|
||||
assert!(tenant_state
|
||||
.maybe_reconfigure(tenant_id)
|
||||
.unwrap()
|
||||
.stripe_size
|
||||
.is_none());
|
||||
|
||||
// Writing the first shard of a multi-sharded situation (i.e. in a split)
|
||||
// resets the tenant state and puts it in an non-notifying state (need to
|
||||
@@ -559,10 +427,7 @@ pub(crate) mod tests {
|
||||
ShardStripeSize(32768),
|
||||
NodeId(1),
|
||||
);
|
||||
assert!(matches!(
|
||||
tenant_state.maybe_send(tenant_id, None),
|
||||
MaybeSendResult::Noop
|
||||
));
|
||||
assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
|
||||
|
||||
// Writing the second shard makes it ready to notify
|
||||
tenant_state.update(
|
||||
@@ -575,16 +440,22 @@ pub(crate) mod tests {
|
||||
NodeId(1),
|
||||
);
|
||||
|
||||
let send_result = tenant_state.maybe_send(tenant_id, None);
|
||||
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
|
||||
anyhow::bail!("Wrong send result");
|
||||
};
|
||||
assert_eq!(request.shards.len(), 2);
|
||||
assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
|
||||
|
||||
// Simulate successful send
|
||||
*guard = Some(request);
|
||||
drop(guard);
|
||||
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
|
||||
assert_eq!(
|
||||
tenant_state
|
||||
.maybe_reconfigure(tenant_id)
|
||||
.unwrap()
|
||||
.shards
|
||||
.len(),
|
||||
2
|
||||
);
|
||||
assert_eq!(
|
||||
tenant_state
|
||||
.maybe_reconfigure(tenant_id)
|
||||
.unwrap()
|
||||
.stripe_size,
|
||||
Some(ShardStripeSize(32768))
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -8,7 +8,6 @@ use futures::Future;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use metrics::{BuildInfo, NeonMetrics};
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||
TenantTimeTravelRequest, TimelineCreateRequest,
|
||||
@@ -35,8 +34,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest,
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||
};
|
||||
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
@@ -45,19 +43,15 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
|
||||
use routerify::Middleware;
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
pub struct HttpState {
|
||||
service: Arc<crate::service::Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
neon_metrics: NeonMetrics,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
}
|
||||
|
||||
impl HttpState {
|
||||
pub fn new(
|
||||
service: Arc<crate::service::Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
build_info: BuildInfo,
|
||||
) -> Self {
|
||||
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
|
||||
let allowlist_routes = ["/status", "/ready", "/metrics"]
|
||||
.iter()
|
||||
.map(|v| v.parse().unwrap())
|
||||
@@ -65,7 +59,6 @@ impl HttpState {
|
||||
Self {
|
||||
service,
|
||||
auth,
|
||||
neon_metrics: NeonMetrics::new(build_info),
|
||||
allowlist_routes,
|
||||
}
|
||||
}
|
||||
@@ -405,15 +398,6 @@ async fn handle_tenant_describe(
|
||||
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_list(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
json_response(StatusCode::OK, service.tenant_list())
|
||||
}
|
||||
|
||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -427,10 +411,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let nodes = state.service.node_list().await?;
|
||||
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
|
||||
|
||||
json_response(StatusCode::OK, api_nodes)
|
||||
json_response(StatusCode::OK, state.service.node_list().await?)
|
||||
}
|
||||
|
||||
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -497,22 +478,6 @@ async fn handle_tenant_shard_migrate(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
.tenant_update_policy(tenant_id, update_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
@@ -522,18 +487,6 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
|
||||
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.tenant_import(tenant_id).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -556,14 +509,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, state.service.consistency_check().await?)
|
||||
}
|
||||
|
||||
async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
|
||||
}
|
||||
|
||||
/// Status endpoint is just used for checking that our HTTP listener is up
|
||||
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -620,17 +565,9 @@ where
|
||||
.await
|
||||
}
|
||||
|
||||
/// Check if the required scope is held in the request's token, or if the request has
|
||||
/// a token with 'admin' scope then always permit it.
|
||||
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
|
||||
check_permission_with(request, |claims| {
|
||||
match crate::auth::check_permission(claims, required_scope) {
|
||||
Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(_) => Err(e),
|
||||
},
|
||||
Ok(()) => Ok(()),
|
||||
}
|
||||
crate::auth::check_permission(claims, required_scope)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -690,11 +627,10 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
|
||||
|
||||
let state = get_state(&req);
|
||||
let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
|
||||
let payload = crate::metrics::METRICS_REGISTRY.encode();
|
||||
let response = Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, TEXT_FORMAT)
|
||||
@@ -723,7 +659,6 @@ where
|
||||
pub fn make_router(
|
||||
service: Arc<Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
build_info: BuildInfo,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router()
|
||||
.middleware(prologue_metrics_middleware())
|
||||
@@ -740,7 +675,7 @@ pub fn make_router(
|
||||
}
|
||||
|
||||
router
|
||||
.data(Arc::new(HttpState::new(service, auth, build_info)))
|
||||
.data(Arc::new(HttpState::new(service, auth)))
|
||||
.get("/metrics", |r| {
|
||||
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
|
||||
})
|
||||
@@ -771,13 +706,6 @@ pub fn make_router(
|
||||
.post("/debug/v1/node/:node_id/drop", |r| {
|
||||
named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
|
||||
})
|
||||
.post("/debug/v1/tenant/:tenant_id/import", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_tenant_import,
|
||||
RequestName("debug_v1_tenant_import"),
|
||||
)
|
||||
})
|
||||
.get("/debug/v1/tenant", |r| {
|
||||
named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
|
||||
})
|
||||
@@ -798,9 +726,6 @@ pub fn make_router(
|
||||
RequestName("debug_v1_consistency_check"),
|
||||
)
|
||||
})
|
||||
.post("/debug/v1/reconcile_all", |r| {
|
||||
request_span(r, handle_reconcile_all)
|
||||
})
|
||||
.put("/debug/v1/failpoints", |r| {
|
||||
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
|
||||
})
|
||||
@@ -840,16 +765,6 @@ pub fn make_router(
|
||||
RequestName("control_v1_tenant_describe"),
|
||||
)
|
||||
})
|
||||
.get("/control/v1/tenant", |r| {
|
||||
tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
|
||||
})
|
||||
.put("/control/v1/tenant/:tenant_id/policy", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_tenant_update_policy,
|
||||
RequestName("control_v1_tenant_policy"),
|
||||
)
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
@@ -14,7 +14,7 @@ mod reconciler;
|
||||
mod scheduler;
|
||||
mod schema;
|
||||
pub mod service;
|
||||
mod tenant_shard;
|
||||
mod tenant_state;
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
|
||||
struct Sequence(u64);
|
||||
@@ -1,20 +1,18 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use attachment_service::http::make_router;
|
||||
use attachment_service::metrics::preinitialize_metrics;
|
||||
use attachment_service::persistence::Persistence;
|
||||
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use diesel::Connection;
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use metrics::BuildInfo;
|
||||
use std::sync::Arc;
|
||||
use storage_controller::http::make_router;
|
||||
use storage_controller::metrics::preinitialize_metrics;
|
||||
use storage_controller::persistence::Persistence;
|
||||
use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::logging::{self, LogFormat};
|
||||
|
||||
use utils::sentry_init::init_sentry;
|
||||
use utils::{project_build_tag, project_git_version, tcp_listener};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
@@ -52,7 +50,7 @@ struct Cli {
|
||||
#[arg(short, long)]
|
||||
path: Option<Utf8PathBuf>,
|
||||
|
||||
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
|
||||
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
|
||||
#[arg(long)]
|
||||
database_url: Option<String>,
|
||||
|
||||
@@ -160,8 +158,6 @@ fn main() -> anyhow::Result<()> {
|
||||
std::process::exit(1);
|
||||
}));
|
||||
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
// We use spawn_blocking for database operations, so require approximately
|
||||
// as many blocking threads as we will open database connections.
|
||||
@@ -193,11 +189,6 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
args.listen
|
||||
);
|
||||
|
||||
let build_info = BuildInfo {
|
||||
revision: GIT_VERSION,
|
||||
build_tag: BUILD_TAG,
|
||||
};
|
||||
|
||||
let strict_mode = if args.dev {
|
||||
StrictMode::Dev
|
||||
} else {
|
||||
@@ -259,7 +250,7 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
let auth = secrets
|
||||
.public_key
|
||||
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
|
||||
let router = make_router(service.clone(), auth, build_info)
|
||||
let router = make_router(service.clone(), auth)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let router_service = utils::http::RouterService::new(router).unwrap();
|
||||
@@ -8,8 +8,10 @@
|
||||
//! The rest of the code defines label group types and deals with converting outer types to labels.
|
||||
//!
|
||||
use bytes::Bytes;
|
||||
use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
|
||||
use metrics::NeonMetrics;
|
||||
use measured::{
|
||||
label::{LabelValue, StaticLabelSet},
|
||||
FixedCardinalityLabel, MetricGroup,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use std::sync::Mutex;
|
||||
|
||||
@@ -24,28 +26,21 @@ pub fn preinitialize_metrics() {
|
||||
|
||||
pub(crate) struct StorageControllerMetrics {
|
||||
pub(crate) metrics_group: StorageControllerMetricGroup,
|
||||
encoder: Mutex<measured::text::BufferedTextEncoder>,
|
||||
encoder: Mutex<measured::text::TextEncoder>,
|
||||
}
|
||||
|
||||
#[derive(measured::MetricGroup)]
|
||||
#[metric(new())]
|
||||
pub(crate) struct StorageControllerMetricGroup {
|
||||
/// Count of how many times we spawn a reconcile task
|
||||
pub(crate) storage_controller_reconcile_spawn: measured::Counter,
|
||||
|
||||
/// Reconciler tasks completed, broken down by success/failure/cancelled
|
||||
pub(crate) storage_controller_reconcile_complete:
|
||||
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
|
||||
|
||||
/// Count of how many times we make an optimization change to a tenant's scheduling
|
||||
pub(crate) storage_controller_schedule_optimization: measured::Counter,
|
||||
|
||||
/// HTTP request status counters for handled requests
|
||||
pub(crate) storage_controller_http_request_status:
|
||||
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
|
||||
|
||||
/// HTTP request handler latency across all status codes
|
||||
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
|
||||
pub(crate) storage_controller_http_request_latency:
|
||||
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
|
||||
|
||||
@@ -57,7 +52,6 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
/// Latency of HTTP requests to the pageserver, broken down by pageserver
|
||||
/// node id, request name and method. This include both successful and unsuccessful
|
||||
/// requests.
|
||||
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
|
||||
pub(crate) storage_controller_pageserver_request_latency:
|
||||
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
|
||||
|
||||
@@ -69,7 +63,6 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
|
||||
/// node id, request name and method. This include both successful and unsuccessful
|
||||
/// requests.
|
||||
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
|
||||
pub(crate) storage_controller_passthrough_request_latency:
|
||||
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
|
||||
|
||||
@@ -78,34 +71,75 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
|
||||
|
||||
/// Latency of database queries, broken down by operation.
|
||||
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
|
||||
pub(crate) storage_controller_database_query_latency:
|
||||
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
|
||||
}
|
||||
|
||||
impl StorageControllerMetrics {
|
||||
pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
|
||||
pub(crate) fn encode(&self) -> Bytes {
|
||||
let mut encoder = self.encoder.lock().unwrap();
|
||||
neon_metrics
|
||||
.collect_group_into(&mut *encoder)
|
||||
.unwrap_or_else(|infallible| match infallible {});
|
||||
self.metrics_group
|
||||
.collect_group_into(&mut *encoder)
|
||||
.unwrap_or_else(|infallible| match infallible {});
|
||||
self.metrics_group.collect_into(&mut *encoder);
|
||||
encoder.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for StorageControllerMetrics {
|
||||
fn default() -> Self {
|
||||
let mut metrics_group = StorageControllerMetricGroup::new();
|
||||
metrics_group
|
||||
.storage_controller_reconcile_complete
|
||||
.init_all_dense();
|
||||
|
||||
Self {
|
||||
metrics_group,
|
||||
encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
|
||||
metrics_group: StorageControllerMetricGroup::new(),
|
||||
encoder: Mutex::new(measured::text::TextEncoder::new()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StorageControllerMetricGroup {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
storage_controller_reconcile_spawn: measured::Counter::new(),
|
||||
storage_controller_reconcile_complete: measured::CounterVec::new(
|
||||
ReconcileCompleteLabelGroupSet {
|
||||
status: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_http_request_status: measured::CounterVec::new(
|
||||
HttpRequestStatusLabelGroupSet {
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
status: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_http_request_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
storage_controller_pageserver_request_error: measured::CounterVec::new(
|
||||
PageserverRequestLabelGroupSet {
|
||||
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
storage_controller_passthrough_request_error: measured::CounterVec::new(
|
||||
PageserverRequestLabelGroupSet {
|
||||
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
storage_controller_database_query_error: measured::CounterVec::new(
|
||||
DatabaseQueryErrorLabelGroupSet {
|
||||
operation: StaticLabelSet::new(),
|
||||
error_type: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_database_query_latency: measured::HistogramVec::new(
|
||||
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -119,7 +153,7 @@ pub(crate) struct ReconcileCompleteLabelGroup {
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = HttpRequestStatusLabelGroupSet)]
|
||||
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) path: &'a str,
|
||||
pub(crate) method: Method,
|
||||
pub(crate) status: StatusCode,
|
||||
@@ -128,21 +162,40 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> {
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = HttpRequestLatencyLabelGroupSet)]
|
||||
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) path: &'a str,
|
||||
pub(crate) method: Method,
|
||||
}
|
||||
|
||||
impl Default for HttpRequestLatencyLabelGroupSet {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup, Clone)]
|
||||
#[label(set = PageserverRequestLabelGroupSet)]
|
||||
pub(crate) struct PageserverRequestLabelGroup<'a> {
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) pageserver_id: &'a str,
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
|
||||
#[label(dynamic_with = lasso::ThreadedRodeo)]
|
||||
pub(crate) path: &'a str,
|
||||
pub(crate) method: Method,
|
||||
}
|
||||
|
||||
impl Default for PageserverRequestLabelGroupSet {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
pageserver_id: lasso::ThreadedRodeo::new(),
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
method: StaticLabelSet::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(measured::LabelGroup)]
|
||||
#[label(set = DatabaseQueryErrorLabelGroupSet)]
|
||||
pub(crate) struct DatabaseQueryErrorLabelGroup {
|
||||
@@ -156,7 +209,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
|
||||
pub(crate) operation: DatabaseOperation,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy)]
|
||||
#[derive(FixedCardinalityLabel)]
|
||||
pub(crate) enum ReconcileOutcome {
|
||||
#[label(rename = "ok")]
|
||||
Success,
|
||||
@@ -164,7 +217,7 @@ pub(crate) enum ReconcileOutcome {
|
||||
Cancel,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[derive(FixedCardinalityLabel, Clone)]
|
||||
pub(crate) enum Method {
|
||||
Get,
|
||||
Put,
|
||||
@@ -189,12 +242,11 @@ impl From<hyper::Method> for Method {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
|
||||
|
||||
impl LabelValue for StatusCode {
|
||||
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
|
||||
v.write_int(self.0.as_u16() as i64)
|
||||
v.write_int(self.0.as_u16() as u64)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,7 +264,7 @@ impl FixedCardinalityLabel for StatusCode {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy)]
|
||||
#[derive(FixedCardinalityLabel)]
|
||||
pub(crate) enum DatabaseErrorLabel {
|
||||
Query,
|
||||
Connection,
|
||||
@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
|
||||
TenantLocateResponseShard,
|
||||
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
@@ -257,19 +256,6 @@ impl Node {
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Generate the simplified API-friendly description of a node's state
|
||||
pub(crate) fn describe(&self) -> NodeDescribeResponse {
|
||||
NodeDescribeResponse {
|
||||
id: self.id,
|
||||
availability: self.availability.into(),
|
||||
scheduling: self.scheduling,
|
||||
listen_http_addr: self.listen_http_addr.clone(),
|
||||
listen_http_port: self.listen_http_port,
|
||||
listen_pg_addr: self.listen_pg_addr.clone(),
|
||||
listen_pg_port: self.listen_pg_port,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Node {
|
||||
@@ -1,14 +1,13 @@
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
|
||||
TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use pageserver_client::mgmt_api::{Client, Result};
|
||||
use reqwest::StatusCode;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
|
||||
/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
|
||||
/// controller to collect metrics in a non-intrusive manner.
|
||||
@@ -89,18 +88,6 @@ impl PageserverClient {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_scan_remote_storage(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<TenantScanRemoteStorageResponse> {
|
||||
measured_request!(
|
||||
"tenant_scan_remote_storage",
|
||||
crate::metrics::Method::Get,
|
||||
&self.node_id_label,
|
||||
self.inner.tenant_scan_remote_storage(tenant_id).await
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_secondary_download(
|
||||
&self,
|
||||
tenant_id: TenantShardId,
|
||||
@@ -9,7 +9,6 @@ use camino::Utf8PathBuf;
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::controller_api::ShardSchedulingPolicy;
|
||||
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::ShardConfigError;
|
||||
@@ -79,7 +78,7 @@ pub(crate) enum DatabaseError {
|
||||
Logical(String),
|
||||
}
|
||||
|
||||
#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
|
||||
#[derive(measured::FixedCardinalityLabel, Clone)]
|
||||
pub(crate) enum DatabaseOperation {
|
||||
InsertNode,
|
||||
UpdateNode,
|
||||
@@ -108,12 +107,6 @@ pub(crate) enum AbortShardSplitStatus {
|
||||
|
||||
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
|
||||
|
||||
/// Some methods can operate on either a whole tenant or a single shard
|
||||
pub(crate) enum TenantFilter {
|
||||
Tenant(TenantId),
|
||||
Shard(TenantShardId),
|
||||
}
|
||||
|
||||
impl Persistence {
|
||||
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
|
||||
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
|
||||
@@ -147,13 +140,15 @@ impl Persistence {
|
||||
/// Wraps `with_conn` in order to collect latency and error metrics
|
||||
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
let latency = &METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_database_query_latency;
|
||||
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op });
|
||||
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
|
||||
operation: op.clone(),
|
||||
});
|
||||
|
||||
let res = self.with_conn(func).await;
|
||||
|
||||
@@ -173,7 +168,7 @@ impl Persistence {
|
||||
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
||||
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
let mut conn = self.connection_pool.get()?;
|
||||
@@ -280,11 +275,6 @@ impl Persistence {
|
||||
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
|
||||
shard.placement_policy = "{\"Attached\":0}".to_string();
|
||||
}
|
||||
|
||||
if shard.scheduling_policy.is_empty() {
|
||||
shard.scheduling_policy =
|
||||
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
|
||||
@@ -475,45 +465,59 @@ impl Persistence {
|
||||
/// that we only do the first time a tenant is set to an attached policy via /location_config.
|
||||
pub(crate) async fn update_tenant_shard(
|
||||
&self,
|
||||
tenant: TenantFilter,
|
||||
input_placement_policy: Option<PlacementPolicy>,
|
||||
input_config: Option<TenantConfig>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
input_placement_policy: PlacementPolicy,
|
||||
input_config: TenantConfig,
|
||||
input_generation: Option<Generation>,
|
||||
input_scheduling_policy: Option<ShardSchedulingPolicy>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
|
||||
let query = match tenant {
|
||||
TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.into_boxed(),
|
||||
TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
||||
.into_boxed(),
|
||||
};
|
||||
let query = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
|
||||
|
||||
#[derive(AsChangeset)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
struct ShardUpdate {
|
||||
generation: Option<i32>,
|
||||
placement_policy: Option<String>,
|
||||
config: Option<String>,
|
||||
scheduling_policy: Option<String>,
|
||||
if let Some(input_generation) = input_generation {
|
||||
// Update includes generation column
|
||||
query
|
||||
.set((
|
||||
generation.eq(Some(input_generation.into().unwrap() as i32)),
|
||||
placement_policy
|
||||
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
|
||||
config.eq(serde_json::to_string(&input_config).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
} else {
|
||||
// Update does not include generation column
|
||||
query
|
||||
.set((
|
||||
placement_policy
|
||||
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
|
||||
config.eq(serde_json::to_string(&input_config).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
}
|
||||
|
||||
let update = ShardUpdate {
|
||||
generation: input_generation.map(|g| g.into().unwrap() as i32),
|
||||
placement_policy: input_placement_policy
|
||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
||||
config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
|
||||
scheduling_policy: input_scheduling_policy
|
||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
||||
};
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
|
||||
query.set(update).execute(conn)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn update_tenant_config(
|
||||
&self,
|
||||
input_tenant_id: TenantId,
|
||||
input_config: TenantConfig,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
|
||||
diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
||||
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(())
|
||||
})
|
||||
@@ -694,7 +698,7 @@ impl Persistence {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
|
||||
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
pub(crate) struct TenantShardPersistence {
|
||||
@@ -724,8 +728,6 @@ pub(crate) struct TenantShardPersistence {
|
||||
pub(crate) splitting: SplitState,
|
||||
#[serde(default)]
|
||||
pub(crate) config: String,
|
||||
#[serde(default)]
|
||||
pub(crate) scheduling_policy: String,
|
||||
}
|
||||
|
||||
impl TenantShardPersistence {
|
||||
@@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard;
|
||||
|
||||
use crate::compute_hook::{ComputeHook, NotifyError};
|
||||
use crate::node::Node;
|
||||
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
|
||||
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
|
||||
|
||||
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
|
||||
|
||||
/// Object with the lifetime of the background reconcile task that is created
|
||||
/// for tenants which have a difference between their intent and observed states.
|
||||
pub(super) struct Reconciler {
|
||||
/// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
|
||||
/// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
|
||||
/// of a tenant's state from when we spawned a reconcile task.
|
||||
pub(super) tenant_shard_id: TenantShardId,
|
||||
pub(crate) shard: ShardIdentity,
|
||||
@@ -48,11 +48,11 @@ pub(super) struct Reconciler {
|
||||
|
||||
/// To avoid stalling if the cloud control plane is unavailable, we may proceed
|
||||
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
|
||||
/// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
|
||||
/// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
|
||||
pub(crate) compute_notify_failure: bool,
|
||||
|
||||
/// A means to abort background reconciliation: it is essential to
|
||||
/// call this when something changes in the original TenantShard that
|
||||
/// call this when something changes in the original TenantState that
|
||||
/// will make this reconciliation impossible or unnecessary, for
|
||||
/// example when a pageserver node goes offline, or the PlacementPolicy for
|
||||
/// the tenant is changed.
|
||||
@@ -66,7 +66,7 @@ pub(super) struct Reconciler {
|
||||
pub(crate) persistence: Arc<Persistence>,
|
||||
}
|
||||
|
||||
/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
|
||||
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
|
||||
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
|
||||
/// and the TargetState is just the instruction for a particular Reconciler run.
|
||||
#[derive(Debug)]
|
||||
@@ -487,7 +487,6 @@ impl Reconciler {
|
||||
while let Err(e) = self.compute_notify().await {
|
||||
match e {
|
||||
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
|
||||
NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
|
||||
_ => {
|
||||
tracing::warn!(
|
||||
"Live migration blocked by compute notification error, retrying: {e}"
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::{node::Node, tenant_shard::TenantShard};
|
||||
use crate::{node::Node, tenant_state::TenantState};
|
||||
use pageserver_api::controller_api::UtilizationScore;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
@@ -27,7 +27,7 @@ pub enum MaySchedule {
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct SchedulerNode {
|
||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
|
||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
|
||||
shard_count: usize,
|
||||
|
||||
/// Whether this node is currently elegible to have new shards scheduled (this is derived
|
||||
@@ -58,86 +58,6 @@ pub(crate) struct Scheduler {
|
||||
nodes: HashMap<NodeId, SchedulerNode>,
|
||||
}
|
||||
|
||||
/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
|
||||
///
|
||||
/// For example, we may set an affinity score based on the number of shards from the same
|
||||
/// tenant already on a node, to implicitly prefer to balance out shards.
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub(crate) struct AffinityScore(pub(crate) usize);
|
||||
|
||||
impl AffinityScore {
|
||||
/// If we have no anti-affinity at all toward a node, this is its score. It means
|
||||
/// the scheduler has a free choice amongst nodes with this score, and may pick a node
|
||||
/// based on other information such as total utilization.
|
||||
pub(crate) const FREE: Self = Self(0);
|
||||
|
||||
pub(crate) fn inc(&mut self) {
|
||||
self.0 += 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Add for AffinityScore {
|
||||
type Output = Self;
|
||||
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
Self(self.0 + rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Hint for whether this is a sincere attempt to schedule, or a speculative
|
||||
/// check for where we _would_ schedule (done during optimization)
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum ScheduleMode {
|
||||
Normal,
|
||||
Speculative,
|
||||
}
|
||||
|
||||
impl Default for ScheduleMode {
|
||||
fn default() -> Self {
|
||||
Self::Normal
|
||||
}
|
||||
}
|
||||
|
||||
// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
|
||||
// it for many shards in the same tenant.
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct ScheduleContext {
|
||||
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
|
||||
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
|
||||
|
||||
/// Specifically how many _attached_ locations are on each node
|
||||
pub(crate) attached_nodes: HashMap<NodeId, usize>,
|
||||
|
||||
pub(crate) mode: ScheduleMode,
|
||||
}
|
||||
|
||||
impl ScheduleContext {
|
||||
/// Input is a list of nodes we would like to avoid using again within this context. The more
|
||||
/// times a node is passed into this call, the less inclined we are to use it.
|
||||
pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
|
||||
for node_id in nodes {
|
||||
let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
|
||||
entry.inc()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
|
||||
let entry = self.attached_nodes.entry(node_id).or_default();
|
||||
*entry += 1;
|
||||
}
|
||||
|
||||
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
|
||||
self.nodes
|
||||
.get(&node_id)
|
||||
.copied()
|
||||
.unwrap_or(AffinityScore::FREE)
|
||||
}
|
||||
|
||||
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
|
||||
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Scheduler {
|
||||
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
|
||||
let mut scheduler_nodes = HashMap::new();
|
||||
@@ -163,7 +83,7 @@ impl Scheduler {
|
||||
pub(crate) fn consistency_check<'a>(
|
||||
&self,
|
||||
nodes: impl Iterator<Item = &'a Node>,
|
||||
shards: impl Iterator<Item = &'a TenantShard>,
|
||||
shards: impl Iterator<Item = &'a TenantState>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
|
||||
for node in nodes {
|
||||
@@ -304,87 +224,53 @@ impl Scheduler {
|
||||
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
||||
}
|
||||
|
||||
/// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they
|
||||
/// are already in use by this shard -- we use this to avoid picking the same node
|
||||
/// as both attached and secondary location. This is a hard constraint: if we cannot
|
||||
/// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
|
||||
///
|
||||
/// context: we prefer to avoid using nodes identified in the context, according
|
||||
/// to their anti-affinity score. We use this to prefeer to avoid placing shards in
|
||||
/// the same tenant on the same node. This is a soft constraint: the context will never
|
||||
/// cause us to fail to schedule a shard.
|
||||
pub(crate) fn schedule_shard(
|
||||
&self,
|
||||
hard_exclude: &[NodeId],
|
||||
context: &ScheduleContext,
|
||||
) -> Result<NodeId, ScheduleError> {
|
||||
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
|
||||
if self.nodes.is_empty() {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
|
||||
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
|
||||
let mut tenant_counts: Vec<(NodeId, usize)> = self
|
||||
.nodes
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
|
||||
None
|
||||
} else {
|
||||
Some((
|
||||
*k,
|
||||
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
|
||||
v.shard_count,
|
||||
))
|
||||
Some((*k, v.shard_count))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort by, in order of precedence:
|
||||
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
|
||||
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
|
||||
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
|
||||
scores.sort_by_key(|i| (i.1, i.2, i.0));
|
||||
// Sort by tenant count. Nodes with the same tenant count are sorted by ID.
|
||||
tenant_counts.sort_by_key(|i| (i.1, i.0));
|
||||
|
||||
if scores.is_empty() {
|
||||
// After applying constraints, no pageservers were left.
|
||||
if !matches!(context.mode, ScheduleMode::Speculative) {
|
||||
// If this was not a speculative attempt, log details to understand why we couldn't
|
||||
// schedule: this may help an engineer understand if some nodes are marked offline
|
||||
// in a way that's preventing progress.
|
||||
if tenant_counts.is_empty() {
|
||||
// After applying constraints, no pageservers were left. We log some detail about
|
||||
// the state of nodes to help understand why this happened. This is not logged as an error because
|
||||
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
|
||||
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
|
||||
for (node_id, node) in &self.nodes {
|
||||
tracing::info!(
|
||||
"Scheduling failure, while excluding {hard_exclude:?}, node states:"
|
||||
"Node {node_id}: may_schedule={} shards={}",
|
||||
node.may_schedule != MaySchedule::No,
|
||||
node.shard_count
|
||||
);
|
||||
for (node_id, node) in &self.nodes {
|
||||
tracing::info!(
|
||||
"Node {node_id}: may_schedule={} shards={}",
|
||||
node.may_schedule != MaySchedule::No,
|
||||
node.shard_count
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return Err(ScheduleError::ImpossibleConstraint);
|
||||
}
|
||||
|
||||
// Lowest score wins
|
||||
let node_id = scores.first().unwrap().0;
|
||||
|
||||
if !matches!(context.mode, ScheduleMode::Speculative) {
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
|
||||
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
let node_id = tenant_counts.first().unwrap().0;
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
|
||||
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
// Note that we do not update shard count here to reflect the scheduling: that
|
||||
// is IntentState's job when the scheduled location is used.
|
||||
|
||||
Ok(node_id)
|
||||
}
|
||||
|
||||
/// Unit test access to internal state
|
||||
#[cfg(test)]
|
||||
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
|
||||
self.nodes.get(&node_id).unwrap().shard_count
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -421,7 +307,7 @@ pub(crate) mod test_utils {
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use crate::tenant_shard::IntentState;
|
||||
use crate::tenant_state::IntentState;
|
||||
#[test]
|
||||
fn scheduler_basic() -> anyhow::Result<()> {
|
||||
let nodes = test_utils::make_test_nodes(2);
|
||||
@@ -430,17 +316,15 @@ mod tests {
|
||||
let mut t1_intent = IntentState::new();
|
||||
let mut t2_intent = IntentState::new();
|
||||
|
||||
let context = ScheduleContext::default();
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&[], &context)?;
|
||||
let scheduled = scheduler.schedule_shard(&[])?;
|
||||
t1_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
let scheduled = scheduler.schedule_shard(&[], &context)?;
|
||||
let scheduled = scheduler.schedule_shard(&[])?;
|
||||
t2_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
|
||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
|
||||
t1_intent.push_secondary(&mut scheduler, scheduled);
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
@@ -22,7 +22,6 @@ diesel::table! {
|
||||
placement_policy -> Varchar,
|
||||
splitting -> Int2,
|
||||
config -> Text,
|
||||
scheduling_policy -> Varchar,
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,9 +7,8 @@ use std::{
|
||||
use crate::{
|
||||
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
|
||||
persistence::TenantShardPersistence,
|
||||
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
|
||||
};
|
||||
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
@@ -50,7 +49,7 @@ where
|
||||
/// This struct implement Serialize for debugging purposes, but is _not_ persisted
|
||||
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
|
||||
#[derive(Serialize)]
|
||||
pub(crate) struct TenantShard {
|
||||
pub(crate) struct TenantState {
|
||||
pub(crate) tenant_shard_id: TenantShardId,
|
||||
|
||||
pub(crate) shard: ShardIdentity,
|
||||
@@ -117,10 +116,6 @@ pub(crate) struct TenantShard {
|
||||
/// sending it. This is the mechanism by which compute notifications are included in the scope
|
||||
/// of state that we publish externally in an eventually consistent way.
|
||||
pub(crate) pending_compute_notification: bool,
|
||||
|
||||
// Support/debug tool: if something is going wrong or flapping with scheduling, this may
|
||||
// be set to a non-active state to avoid making changes while the issue is fixed.
|
||||
scheduling_policy: ShardSchedulingPolicy,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, Serialize)]
|
||||
@@ -251,13 +246,8 @@ impl IntentState {
|
||||
|
||||
impl Drop for IntentState {
|
||||
fn drop(&mut self) {
|
||||
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
|
||||
// We do not check this while panicking, to avoid polluting unit test failures or
|
||||
// other assertions with this assertion's output. It's still wrong to leak these,
|
||||
// but if we already have a panic then we don't need to independently flag this case.
|
||||
if !(std::thread::panicking()) {
|
||||
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
||||
}
|
||||
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
|
||||
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -302,26 +292,6 @@ pub enum ReconcileWaitError {
|
||||
Failed(TenantShardId, String),
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) struct ReplaceSecondary {
|
||||
old_node_id: NodeId,
|
||||
new_node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) struct MigrateAttachment {
|
||||
old_attached_node_id: NodeId,
|
||||
new_attached_node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) enum ScheduleOptimization {
|
||||
// Replace one of our secondary locations with a different node
|
||||
ReplaceSecondary(ReplaceSecondary),
|
||||
// Migrate attachment to an existing secondary location
|
||||
MigrateAttachment(MigrateAttachment),
|
||||
}
|
||||
|
||||
impl ReconcilerWaiter {
|
||||
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
|
||||
tokio::select! {
|
||||
@@ -354,7 +324,7 @@ pub(crate) struct ReconcilerHandle {
|
||||
}
|
||||
|
||||
/// When a reconcile task completes, it sends this result object
|
||||
/// to be applied to the primary TenantShard.
|
||||
/// to be applied to the primary TenantState.
|
||||
pub(crate) struct ReconcileResult {
|
||||
pub(crate) sequence: Sequence,
|
||||
/// On errors, `observed` should be treated as an incompleted description
|
||||
@@ -367,7 +337,7 @@ pub(crate) struct ReconcileResult {
|
||||
pub(crate) generation: Option<Generation>,
|
||||
pub(crate) observed: ObservedState,
|
||||
|
||||
/// Set [`TenantShard::pending_compute_notification`] from this flag
|
||||
/// Set [`TenantState::pending_compute_notification`] from this flag
|
||||
pub(crate) pending_compute_notification: bool,
|
||||
}
|
||||
|
||||
@@ -379,7 +349,7 @@ impl ObservedState {
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantShard {
|
||||
impl TenantState {
|
||||
pub(crate) fn new(
|
||||
tenant_shard_id: TenantShardId,
|
||||
shard: ShardIdentity,
|
||||
@@ -400,7 +370,6 @@ impl TenantShard {
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
last_error: Arc::default(),
|
||||
pending_compute_notification: false,
|
||||
scheduling_policy: ShardSchedulingPolicy::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -456,7 +425,6 @@ impl TenantShard {
|
||||
fn schedule_attached(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &ScheduleContext,
|
||||
) -> Result<(bool, NodeId), ScheduleError> {
|
||||
// No work to do if we already have an attached tenant
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
@@ -470,33 +438,14 @@ impl TenantShard {
|
||||
Ok((true, promote_secondary))
|
||||
} else {
|
||||
// Pick a fresh node: either we had no secondaries or none were schedulable
|
||||
let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
|
||||
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
|
||||
tracing::debug!("Selected {} as attached", node_id);
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
Ok((true, node_id))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn schedule(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &mut ScheduleContext,
|
||||
) -> Result<(), ScheduleError> {
|
||||
let r = self.do_schedule(scheduler, context);
|
||||
|
||||
context.avoid(&self.intent.all_pageservers());
|
||||
if let Some(attached) = self.intent.get_attached() {
|
||||
context.push_attached(*attached);
|
||||
}
|
||||
|
||||
r
|
||||
}
|
||||
|
||||
pub(crate) fn do_schedule(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &ScheduleContext,
|
||||
) -> Result<(), ScheduleError> {
|
||||
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
|
||||
// TODO: before scheduling new nodes, check if any existing content in
|
||||
// self.intent refers to pageservers that are offline, and pick other
|
||||
// pageservers if so.
|
||||
@@ -504,16 +453,6 @@ impl TenantShard {
|
||||
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
|
||||
// change their attach location.
|
||||
|
||||
match self.scheduling_policy {
|
||||
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
|
||||
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
|
||||
// Warn to make it obvious why other things aren't happening/working, if we skip scheduling
|
||||
tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
|
||||
"Scheduling is disabled by policy {:?}", self.scheduling_policy);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
||||
// more work on the same pageservers we're already using.
|
||||
let mut modified = false;
|
||||
@@ -540,13 +479,12 @@ impl TenantShard {
|
||||
}
|
||||
|
||||
// Should have exactly one attached, and N secondaries
|
||||
let (modified_attached, attached_node_id) =
|
||||
self.schedule_attached(scheduler, context)?;
|
||||
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
let mut used_pageservers = vec![attached_node_id];
|
||||
while self.intent.secondary.len() < secondary_count {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
@@ -559,7 +497,7 @@ impl TenantShard {
|
||||
modified = true;
|
||||
} else if self.intent.secondary.is_empty() {
|
||||
// Populate secondary by scheduling a fresh node
|
||||
let node_id = scheduler.schedule_shard(&[], context)?;
|
||||
let node_id = scheduler.schedule_shard(&[])?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
modified = true;
|
||||
}
|
||||
@@ -586,167 +524,6 @@ impl TenantShard {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Optimize attachments: if a shard has a secondary location that is preferable to
|
||||
/// its primary location based on soft constraints, switch that secondary location
|
||||
/// to be attached.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn optimize_attachment(
|
||||
&self,
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
schedule_context: &ScheduleContext,
|
||||
) -> Option<ScheduleOptimization> {
|
||||
let attached = (*self.intent.get_attached())?;
|
||||
if self.intent.secondary.is_empty() {
|
||||
// We can only do useful work if we have both attached and secondary locations: this
|
||||
// function doesn't schedule new locations, only swaps between attached and secondaries.
|
||||
return None;
|
||||
}
|
||||
|
||||
let current_affinity_score = schedule_context.get_node_affinity(attached);
|
||||
let current_attachment_count = schedule_context.get_node_attachments(attached);
|
||||
|
||||
// Generate score for each node, dropping any un-schedulable nodes.
|
||||
let all_pageservers = self.intent.all_pageservers();
|
||||
let mut scores = all_pageservers
|
||||
.iter()
|
||||
.flat_map(|node_id| {
|
||||
if matches!(
|
||||
nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.may_schedule())
|
||||
.unwrap_or(MaySchedule::No),
|
||||
MaySchedule::No
|
||||
) {
|
||||
None
|
||||
} else {
|
||||
let affinity_score = schedule_context.get_node_affinity(*node_id);
|
||||
let attachment_count = schedule_context.get_node_attachments(*node_id);
|
||||
Some((*node_id, affinity_score, attachment_count))
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Sort precedence:
|
||||
// 1st - prefer nodes with the lowest total affinity score
|
||||
// 2nd - prefer nodes with the lowest number of attachments in this context
|
||||
// 3rd - if all else is equal, sort by node ID for determinism in tests.
|
||||
scores.sort_by_key(|i| (i.1, i.2, i.0));
|
||||
|
||||
if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
|
||||
scores.first()
|
||||
{
|
||||
if attached != *preferred_node {
|
||||
// The best alternative must be more than 1 better than us, otherwise we could end
|
||||
// up flapping back next time we're called (e.g. there's no point migrating from
|
||||
// a location with score 1 to a score zero, because on next location the situation
|
||||
// would be the same, but in reverse).
|
||||
if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
|
||||
|| current_attachment_count > *preferred_attachment_count + 1
|
||||
{
|
||||
tracing::info!(
|
||||
"Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
|
||||
self.intent.get_secondary()
|
||||
);
|
||||
return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id: attached,
|
||||
new_attached_node_id: *preferred_node,
|
||||
}));
|
||||
}
|
||||
} else {
|
||||
tracing::debug!(
|
||||
"Node {} is already preferred (score {:?})",
|
||||
preferred_node,
|
||||
preferred_affinity_score
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Fall-through: we didn't find an optimization
|
||||
None
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn optimize_secondary(
|
||||
&self,
|
||||
scheduler: &Scheduler,
|
||||
schedule_context: &ScheduleContext,
|
||||
) -> Option<ScheduleOptimization> {
|
||||
if self.intent.secondary.is_empty() {
|
||||
// We can only do useful work if we have both attached and secondary locations: this
|
||||
// function doesn't schedule new locations, only swaps between attached and secondaries.
|
||||
return None;
|
||||
}
|
||||
|
||||
for secondary in self.intent.get_secondary() {
|
||||
let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
|
||||
// We're already on a node unaffected any affinity constraints,
|
||||
// so we won't change it.
|
||||
continue;
|
||||
};
|
||||
|
||||
// Let the scheduler suggest a node, where it would put us if we were scheduling afresh
|
||||
// This implicitly limits the choice to nodes that are available, and prefers nodes
|
||||
// with lower utilization.
|
||||
let Ok(candidate_node) =
|
||||
scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
|
||||
else {
|
||||
// A scheduling error means we have no possible candidate replacements
|
||||
continue;
|
||||
};
|
||||
|
||||
let candidate_affinity_score = schedule_context
|
||||
.nodes
|
||||
.get(&candidate_node)
|
||||
.unwrap_or(&AffinityScore::FREE);
|
||||
|
||||
// The best alternative must be more than 1 better than us, otherwise we could end
|
||||
// up flapping back next time we're called.
|
||||
if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
|
||||
// If some other node is available and has a lower score than this node, then
|
||||
// that other node is a good place to migrate to.
|
||||
tracing::info!(
|
||||
"Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
|
||||
self.intent.get_secondary()
|
||||
);
|
||||
return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
||||
old_node_id: *secondary,
|
||||
new_node_id: candidate_node,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
pub(crate) fn apply_optimization(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
optimization: ScheduleOptimization,
|
||||
) {
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_schedule_optimization
|
||||
.inc();
|
||||
|
||||
match optimization {
|
||||
ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id,
|
||||
new_attached_node_id,
|
||||
}) => {
|
||||
self.intent.demote_attached(old_attached_node_id);
|
||||
self.intent
|
||||
.promote_attached(scheduler, new_attached_node_id);
|
||||
}
|
||||
ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
||||
old_node_id,
|
||||
new_node_id,
|
||||
}) => {
|
||||
self.intent.remove_secondary(scheduler, old_node_id);
|
||||
self.intent.push_secondary(scheduler, new_node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Query whether the tenant's observed state for attached node matches its intent state, and if so,
|
||||
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
|
||||
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
|
||||
@@ -891,19 +668,6 @@ impl TenantShard {
|
||||
}
|
||||
}
|
||||
|
||||
// Pre-checks done: finally check whether we may actually do the work
|
||||
match self.scheduling_policy {
|
||||
ShardSchedulingPolicy::Active
|
||||
| ShardSchedulingPolicy::Essential
|
||||
| ShardSchedulingPolicy::Pause => {}
|
||||
ShardSchedulingPolicy::Stop => {
|
||||
// We only reach this point if there is work to do and we're going to skip
|
||||
// doing it: warn it obvious why this tenant isn't doing what it ought to.
|
||||
tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// Build list of nodes from which the reconciler should detach
|
||||
let mut detach = Vec::new();
|
||||
for node_id in self.observed.locations.keys() {
|
||||
@@ -1040,22 +804,6 @@ impl TenantShard {
|
||||
})
|
||||
}
|
||||
|
||||
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
|
||||
/// if it is not already running
|
||||
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
|
||||
if self.reconciler.is_some() {
|
||||
Some(ReconcilerWaiter {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
seq_wait: self.waiter.clone(),
|
||||
error_seq_wait: self.error_waiter.clone(),
|
||||
error: self.last_error.clone(),
|
||||
seq: self.sequence,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Called when a ReconcileResult has been emitted and the service is updating
|
||||
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
|
||||
/// the handle to indicate there is no longer a reconciliation in progress.
|
||||
@@ -1081,40 +829,6 @@ impl TenantShard {
|
||||
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
|
||||
}
|
||||
|
||||
pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
|
||||
self.scheduling_policy = p;
|
||||
}
|
||||
|
||||
pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
|
||||
&self.scheduling_policy
|
||||
}
|
||||
|
||||
pub(crate) fn from_persistent(
|
||||
tsp: TenantShardPersistence,
|
||||
intent: IntentState,
|
||||
) -> anyhow::Result<Self> {
|
||||
let tenant_shard_id = tsp.get_tenant_shard_id()?;
|
||||
let shard_identity = tsp.get_shard_identity()?;
|
||||
|
||||
Ok(Self {
|
||||
tenant_shard_id,
|
||||
shard: shard_identity,
|
||||
sequence: Sequence::initial(),
|
||||
generation: tsp.generation.map(|g| Generation::new(g as u32)),
|
||||
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
|
||||
intent,
|
||||
observed: ObservedState::new(),
|
||||
config: serde_json::from_str(&tsp.config).unwrap(),
|
||||
reconciler: None,
|
||||
splitting: tsp.splitting,
|
||||
waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
last_error: Arc::default(),
|
||||
pending_compute_notification: false,
|
||||
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
|
||||
TenantShardPersistence {
|
||||
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
|
||||
@@ -1126,7 +840,6 @@ impl TenantShard {
|
||||
placement_policy: serde_json::to_string(&self.policy).unwrap(),
|
||||
config: serde_json::to_string(&self.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1143,7 +856,7 @@ pub(crate) mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
|
||||
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
|
||||
let tenant_id = TenantId::generate();
|
||||
let shard_number = ShardNumber(0);
|
||||
let shard_count = ShardCount::new(1);
|
||||
@@ -1153,7 +866,7 @@ pub(crate) mod tests {
|
||||
shard_number,
|
||||
shard_count,
|
||||
};
|
||||
TenantShard::new(
|
||||
TenantState::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
@@ -1165,32 +878,6 @@ pub(crate) mod tests {
|
||||
)
|
||||
}
|
||||
|
||||
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
(0..shard_count.count())
|
||||
.map(|i| {
|
||||
let shard_number = ShardNumber(i);
|
||||
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
};
|
||||
TenantShard::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
policy.clone(),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Test the scheduling behaviors used when a tenant configured for HA is subject
|
||||
/// to nodes being marked offline.
|
||||
#[test]
|
||||
@@ -1200,26 +887,25 @@ pub(crate) mod tests {
|
||||
let mut nodes = make_test_nodes(3);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut context = ScheduleContext::default();
|
||||
|
||||
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
tenant_shard
|
||||
.schedule(&mut scheduler, &mut context)
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
tenant_state
|
||||
.schedule(&mut scheduler)
|
||||
.expect("we have enough nodes, scheduling should work");
|
||||
|
||||
// Expect to initially be schedule on to different nodes
|
||||
assert_eq!(tenant_shard.intent.secondary.len(), 1);
|
||||
assert!(tenant_shard.intent.attached.is_some());
|
||||
assert_eq!(tenant_state.intent.secondary.len(), 1);
|
||||
assert!(tenant_state.intent.attached.is_some());
|
||||
|
||||
let attached_node_id = tenant_shard.intent.attached.unwrap();
|
||||
let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
|
||||
let attached_node_id = tenant_state.intent.attached.unwrap();
|
||||
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
|
||||
assert_ne!(attached_node_id, secondary_node_id);
|
||||
|
||||
// Notifying the attached node is offline should demote it to a secondary
|
||||
let changed = tenant_shard.intent.demote_attached(attached_node_id);
|
||||
let changed = tenant_state.intent.demote_attached(attached_node_id);
|
||||
assert!(changed);
|
||||
assert!(tenant_shard.intent.attached.is_none());
|
||||
assert_eq!(tenant_shard.intent.secondary.len(), 2);
|
||||
assert!(tenant_state.intent.attached.is_none());
|
||||
assert_eq!(tenant_state.intent.secondary.len(), 2);
|
||||
|
||||
// Update the scheduler state to indicate the node is offline
|
||||
nodes
|
||||
@@ -1229,18 +915,18 @@ pub(crate) mod tests {
|
||||
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
|
||||
|
||||
// Scheduling the node should promote the still-available secondary node to attached
|
||||
tenant_shard
|
||||
.schedule(&mut scheduler, &mut context)
|
||||
tenant_state
|
||||
.schedule(&mut scheduler)
|
||||
.expect("active nodes are available");
|
||||
assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
|
||||
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
|
||||
|
||||
// The original attached node should have been retained as a secondary
|
||||
assert_eq!(
|
||||
*tenant_shard.intent.secondary.iter().last().unwrap(),
|
||||
*tenant_state.intent.secondary.iter().last().unwrap(),
|
||||
attached_node_id
|
||||
);
|
||||
|
||||
tenant_shard.intent.clear(&mut scheduler);
|
||||
tenant_state.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1250,263 +936,48 @@ pub(crate) mod tests {
|
||||
let nodes = make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
tenant_shard.observed.locations.insert(
|
||||
tenant_state.observed.locations.insert(
|
||||
NodeId(3),
|
||||
ObservedStateLocation {
|
||||
conf: Some(LocationConfig {
|
||||
mode: LocationConfigMode::AttachedMulti,
|
||||
generation: Some(2),
|
||||
secondary_conf: None,
|
||||
shard_number: tenant_shard.shard.number.0,
|
||||
shard_count: tenant_shard.shard.count.literal(),
|
||||
shard_stripe_size: tenant_shard.shard.stripe_size.0,
|
||||
shard_number: tenant_state.shard.number.0,
|
||||
shard_count: tenant_state.shard.count.literal(),
|
||||
shard_stripe_size: tenant_state.shard.stripe_size.0,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
tenant_shard.observed.locations.insert(
|
||||
tenant_state.observed.locations.insert(
|
||||
NodeId(2),
|
||||
ObservedStateLocation {
|
||||
conf: Some(LocationConfig {
|
||||
mode: LocationConfigMode::AttachedStale,
|
||||
generation: Some(1),
|
||||
secondary_conf: None,
|
||||
shard_number: tenant_shard.shard.number.0,
|
||||
shard_count: tenant_shard.shard.count.literal(),
|
||||
shard_stripe_size: tenant_shard.shard.stripe_size.0,
|
||||
shard_number: tenant_state.shard.number.0,
|
||||
shard_count: tenant_state.shard.count.literal(),
|
||||
shard_stripe_size: tenant_state.shard.stripe_size.0,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
tenant_shard.intent_from_observed(&mut scheduler);
|
||||
tenant_state.intent_from_observed(&mut scheduler);
|
||||
|
||||
// The highest generationed attached location gets used as attached
|
||||
assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
|
||||
assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
|
||||
// Other locations get used as secondary
|
||||
assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
|
||||
assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
|
||||
|
||||
scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
|
||||
|
||||
tenant_shard.intent.clear(&mut scheduler);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scheduling_mode() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
// In pause mode, schedule() shouldn't do anything
|
||||
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
|
||||
assert!(tenant_shard
|
||||
.schedule(&mut scheduler, &mut ScheduleContext::default())
|
||||
.is_ok());
|
||||
assert!(tenant_shard.intent.all_pageservers().is_empty());
|
||||
|
||||
// In active mode, schedule() works
|
||||
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
|
||||
assert!(tenant_shard
|
||||
.schedule(&mut scheduler, &mut ScheduleContext::default())
|
||||
.is_ok());
|
||||
assert!(!tenant_shard.intent.all_pageservers().is_empty());
|
||||
|
||||
tenant_shard.intent.clear(&mut scheduler);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn optimize_attachment() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
// Initially: both nodes attached on shard 1, and both have secondary locations
|
||||
// on different nodes.
|
||||
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
||||
shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
|
||||
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
||||
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
schedule_context.avoid(&shard_a.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
|
||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
||||
|
||||
let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
|
||||
|
||||
// Either shard should recognize that it has the option to switch to a secondary location where there
|
||||
// would be no other shards from the same tenant, and request to do so.
|
||||
assert_eq!(
|
||||
optimization_a,
|
||||
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id: NodeId(1),
|
||||
new_attached_node_id: NodeId(2)
|
||||
}))
|
||||
);
|
||||
|
||||
// Note that these optimizing two shards in the same tenant with the same ScheduleContext is
|
||||
// mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
|
||||
// of [`Service::optimize_all`] to avoid trying
|
||||
// to do optimizations for multiple shards in the same tenant at the same time. Generating
|
||||
// both optimizations is just done for test purposes
|
||||
let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
|
||||
assert_eq!(
|
||||
optimization_b,
|
||||
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id: NodeId(1),
|
||||
new_attached_node_id: NodeId(3)
|
||||
}))
|
||||
);
|
||||
|
||||
// Applying these optimizations should result in the end state proposed
|
||||
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
|
||||
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
|
||||
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
|
||||
shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
|
||||
assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
|
||||
assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
|
||||
|
||||
shard_a.intent.clear(&mut scheduler);
|
||||
shard_b.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn optimize_secondary() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(4);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
// Initially: both nodes attached on shard 1, and both have secondary locations
|
||||
// on different nodes.
|
||||
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
||||
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
|
||||
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
schedule_context.avoid(&shard_a.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
|
||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
||||
|
||||
let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
|
||||
|
||||
// Since there is a node with no locations available, the node with two locations for the
|
||||
// same tenant should generate an optimization to move one away
|
||||
assert_eq!(
|
||||
optimization_a,
|
||||
Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
||||
old_node_id: NodeId(3),
|
||||
new_node_id: NodeId(4)
|
||||
}))
|
||||
);
|
||||
|
||||
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
|
||||
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
|
||||
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
|
||||
|
||||
shard_a.intent.clear(&mut scheduler);
|
||||
shard_b.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Optimize til quiescent: this emulates what Service::optimize_all does, when
|
||||
// called repeatedly in the background.
|
||||
fn optimize_til_idle(
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
scheduler: &mut Scheduler,
|
||||
shards: &mut [TenantShard],
|
||||
) {
|
||||
let mut loop_n = 0;
|
||||
loop {
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
let mut any_changed = false;
|
||||
|
||||
for shard in shards.iter() {
|
||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
||||
if let Some(attached) = shard.intent.get_attached() {
|
||||
schedule_context.push_attached(*attached);
|
||||
}
|
||||
}
|
||||
|
||||
for shard in shards.iter_mut() {
|
||||
let optimization = shard.optimize_attachment(nodes, &schedule_context);
|
||||
if let Some(optimization) = optimization {
|
||||
shard.apply_optimization(scheduler, optimization);
|
||||
any_changed = true;
|
||||
break;
|
||||
}
|
||||
|
||||
let optimization = shard.optimize_secondary(scheduler, &schedule_context);
|
||||
if let Some(optimization) = optimization {
|
||||
shard.apply_optimization(scheduler, optimization);
|
||||
any_changed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !any_changed {
|
||||
break;
|
||||
}
|
||||
|
||||
// Assert no infinite loop
|
||||
loop_n += 1;
|
||||
assert!(loop_n < 1000);
|
||||
}
|
||||
}
|
||||
|
||||
/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
|
||||
/// that it converges.
|
||||
#[test]
|
||||
fn optimize_add_nodes() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(4);
|
||||
|
||||
// Only show the scheduler a couple of nodes
|
||||
let mut scheduler = Scheduler::new([].iter());
|
||||
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
|
||||
scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
|
||||
|
||||
let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
for shard in &mut shards {
|
||||
assert!(shard
|
||||
.schedule(&mut scheduler, &mut schedule_context)
|
||||
.is_ok());
|
||||
}
|
||||
|
||||
// We should see equal number of locations on the two nodes.
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
|
||||
|
||||
// Add another two nodes: we should see the shards spread out when their optimize
|
||||
// methods are called
|
||||
scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
|
||||
scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
|
||||
optimize_til_idle(&nodes, &mut scheduler, &mut shards);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
|
||||
|
||||
for shard in shards.iter_mut() {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
}
|
||||
scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
|
||||
|
||||
tenant_state.intent.clear(&mut scheduler);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -86,10 +86,7 @@ where
|
||||
.stdout(process_log_file)
|
||||
.stderr(same_file_for_stderr)
|
||||
.args(args);
|
||||
|
||||
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
|
||||
fill_rust_env_vars(background_command),
|
||||
));
|
||||
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
|
||||
filled_cmd.envs(envs);
|
||||
|
||||
let pid_file_to_check = match &initial_pid_file {
|
||||
@@ -271,15 +268,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
cmd
|
||||
}
|
||||
|
||||
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
|
||||
for (var, val) in std::env::vars() {
|
||||
if var.starts_with("NEON_PAGESERVER_") {
|
||||
cmd = cmd.env(var, val);
|
||||
}
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
|
||||
/// 1. Claims a pidfile with a fcntl lock on it and
|
||||
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
|
||||
|
||||
@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::StorageController;
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
@@ -417,54 +419,6 @@ async fn handle_tenant(
|
||||
println!("{} {:?}", t.id, t.state);
|
||||
}
|
||||
}
|
||||
Some(("import", import_match)) => {
|
||||
let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let create_response = storage_controller.tenant_import(tenant_id).await?;
|
||||
|
||||
let shard_zero = create_response
|
||||
.shards
|
||||
.first()
|
||||
.expect("Import response omitted shards");
|
||||
|
||||
let attached_pageserver_id = shard_zero.node_id;
|
||||
let pageserver =
|
||||
PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
|
||||
|
||||
println!(
|
||||
"Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
|
||||
);
|
||||
|
||||
let timelines = pageserver
|
||||
.http_client
|
||||
.list_timelines(shard_zero.shard_id)
|
||||
.await?;
|
||||
|
||||
// Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
|
||||
let main_timeline = timelines
|
||||
.iter()
|
||||
.find(|t| t.ancestor_timeline_id.is_none())
|
||||
.expect("No timelines found")
|
||||
.timeline_id;
|
||||
|
||||
let mut branch_i = 0;
|
||||
for timeline in timelines.iter() {
|
||||
let branch_name = if timeline.timeline_id == main_timeline {
|
||||
"main".to_string()
|
||||
} else {
|
||||
branch_i += 1;
|
||||
format!("branch_{branch_i}")
|
||||
};
|
||||
|
||||
println!(
|
||||
"Importing timeline {tenant_id}/{} as branch {branch_name}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
|
||||
env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
|
||||
}
|
||||
}
|
||||
Some(("create", create_match)) => {
|
||||
let tenant_conf: HashMap<_, _> = create_match
|
||||
.get_many::<String>("config")
|
||||
@@ -1106,6 +1060,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
}
|
||||
|
||||
Some(("set-state", subcommand_args)) => {
|
||||
let pageserver = get_pageserver(env, subcommand_args)?;
|
||||
let scheduling = subcommand_args.get_one("scheduling");
|
||||
let availability = subcommand_args.get_one("availability");
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.node_configure(NodeConfigureRequest {
|
||||
node_id: pageserver.conf.id,
|
||||
scheduling: scheduling.cloned(),
|
||||
availability: availability.cloned(),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
Some(("status", subcommand_args)) => {
|
||||
match get_pageserver(env, subcommand_args)?.check_status().await {
|
||||
Ok(_) => println!("Page server is up and running"),
|
||||
@@ -1279,7 +1248,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
match ComputeControlPlane::load(env.clone()) {
|
||||
Ok(cplane) => {
|
||||
for (_k, node) in cplane.endpoints {
|
||||
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
|
||||
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
|
||||
eprintln!("postgres stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
@@ -1465,7 +1434,6 @@ fn cli() -> Command {
|
||||
.subcommand(
|
||||
Command::new("timeline")
|
||||
.about("Manage timelines")
|
||||
.arg_required_else_help(true)
|
||||
.subcommand(Command::new("list")
|
||||
.about("List all timelines, available to this pageserver")
|
||||
.arg(tenant_id_arg.clone()))
|
||||
@@ -1528,8 +1496,6 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("config")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
||||
.subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
|
||||
.about("Import a tenant that is present in remote storage, and create branches for its timelines"))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("pageserver")
|
||||
@@ -1549,6 +1515,12 @@ fn cli() -> Command {
|
||||
.about("Restart local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
.subcommand(Command::new("set-state")
|
||||
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
|
||||
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
|
||||
.about("Set scheduling or availability state of pageserver node")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("storage_controller")
|
||||
|
||||
@@ -156,7 +156,6 @@ pub struct SafekeeperConf {
|
||||
pub remote_storage: Option<String>,
|
||||
pub backup_threads: Option<u32>,
|
||||
pub auth_enabled: bool,
|
||||
pub listen_addr: Option<String>,
|
||||
}
|
||||
|
||||
impl Default for SafekeeperConf {
|
||||
@@ -170,7 +169,6 @@ impl Default for SafekeeperConf {
|
||||
remote_storage: None,
|
||||
backup_threads: None,
|
||||
auth_enabled: false,
|
||||
listen_addr: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -389,10 +389,6 @@ impl PageServerNode {
|
||||
.remove("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
.transpose()?,
|
||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||
walreceiver_connect_timeout: settings
|
||||
.remove("walreceiver_connect_timeout")
|
||||
@@ -505,12 +501,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
|
||||
|
||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||
walreceiver_connect_timeout: settings
|
||||
.remove("walreceiver_connect_timeout")
|
||||
|
||||
@@ -70,31 +70,24 @@ pub struct SafekeeperNode {
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: reqwest::Client,
|
||||
pub listen_addr: String,
|
||||
pub http_base_url: String,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
|
||||
listen_addr.clone()
|
||||
} else {
|
||||
"127.0.0.1".to_string()
|
||||
};
|
||||
SafekeeperNode {
|
||||
id: conf.id,
|
||||
conf: conf.clone(),
|
||||
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
|
||||
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
|
||||
env: env.clone(),
|
||||
http_client: reqwest::Client::new(),
|
||||
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
|
||||
listen_addr,
|
||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to this safekeeper.
|
||||
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
|
||||
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
|
||||
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
|
||||
PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
|
||||
}
|
||||
|
||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||
@@ -118,8 +111,8 @@ impl SafekeeperNode {
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
|
||||
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
|
||||
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
|
||||
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
|
||||
let id = self.id;
|
||||
let datadir = self.datadir_path();
|
||||
|
||||
@@ -146,7 +139,7 @@ impl SafekeeperNode {
|
||||
availability_zone,
|
||||
];
|
||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
|
||||
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
|
||||
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
||||
}
|
||||
if !self.conf.sync {
|
||||
|
||||
@@ -472,16 +472,6 @@ impl StorageController {
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
|
||||
self.dispatch::<(), TenantCreateResponse>(
|
||||
Method::POST,
|
||||
format!("debug/v1/tenant/{tenant_id}/import"),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
||||
self.dispatch::<(), _>(
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
[package]
|
||||
name = "storcon_cli"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
hyper.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
reqwest.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -1,681 +0,0 @@
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use hyper::{Method, StatusCode};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
|
||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
||||
use reqwest::Url;
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum Command {
|
||||
/// Register a pageserver with the storage controller. This shouldn't usually be necessary,
|
||||
/// since pageservers auto-register when they start up
|
||||
NodeRegister {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
|
||||
#[arg(long)]
|
||||
listen_pg_addr: String,
|
||||
#[arg(long)]
|
||||
listen_pg_port: u16,
|
||||
|
||||
#[arg(long)]
|
||||
listen_http_addr: String,
|
||||
#[arg(long)]
|
||||
listen_http_port: u16,
|
||||
},
|
||||
|
||||
/// Modify a node's configuration in the storage controller
|
||||
NodeConfigure {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
|
||||
/// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
|
||||
/// manually mark a node offline
|
||||
#[arg(long)]
|
||||
availability: Option<NodeAvailabilityArg>,
|
||||
/// Scheduling policy controls whether tenant shards may be scheduled onto this node.
|
||||
#[arg(long)]
|
||||
scheduling: Option<NodeSchedulingPolicy>,
|
||||
},
|
||||
/// Modify a tenant's policies in the storage controller
|
||||
TenantPolicy {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
/// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
|
||||
/// or is in the normal attached state with N secondary locations (`attached:N`)
|
||||
#[arg(long)]
|
||||
placement: Option<PlacementPolicyArg>,
|
||||
/// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
|
||||
/// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
|
||||
/// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
|
||||
/// unavailable, and are only for use in emergencies.
|
||||
#[arg(long)]
|
||||
scheduling: Option<ShardSchedulingPolicyArg>,
|
||||
},
|
||||
/// List nodes known to the storage controller
|
||||
Nodes {},
|
||||
/// List tenants known to the storage controller
|
||||
Tenants {},
|
||||
/// Create a new tenant in the storage controller, and by extension on pageservers.
|
||||
TenantCreate {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Delete a tenant in the storage controller, and by extension on pageservers.
|
||||
TenantDelete {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Split an existing tenant into a higher number of shards than its current shard count.
|
||||
TenantShardSplit {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
shard_count: u8,
|
||||
/// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
|
||||
#[arg(long)]
|
||||
stripe_size: Option<u32>,
|
||||
},
|
||||
/// Migrate the attached location for a tenant shard to a specific pageserver.
|
||||
TenantShardMigrate {
|
||||
#[arg(long)]
|
||||
tenant_shard_id: TenantShardId,
|
||||
#[arg(long)]
|
||||
node: NodeId,
|
||||
},
|
||||
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
|
||||
/// that is passed through to pageservers, and does not affect storage controller behavior.
|
||||
TenantConfig {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
config: String,
|
||||
},
|
||||
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
|
||||
/// alternative to the storage controller's scheduling optimization behavior.
|
||||
TenantScatter {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Print details about a particular tenant, including all its shards' states.
|
||||
TenantDescribe {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
|
||||
/// mode so that it can warm up content on a pageserver.
|
||||
TenantWarmup {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(
|
||||
author,
|
||||
version,
|
||||
about,
|
||||
long_about = "CLI for Storage Controller Support/Debug"
|
||||
)]
|
||||
#[command(arg_required_else_help(true))]
|
||||
struct Cli {
|
||||
#[arg(long)]
|
||||
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
|
||||
api: Url,
|
||||
|
||||
#[arg(long)]
|
||||
/// JWT token for authenticating with storage controller. Depending on the API used, this
|
||||
/// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
|
||||
/// a token with both scopes to use with this tool.
|
||||
jwt: Option<String>,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Command,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct PlacementPolicyArg(PlacementPolicy);
|
||||
|
||||
impl FromStr for PlacementPolicyArg {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"detached" => Ok(Self(PlacementPolicy::Detached)),
|
||||
"secondary" => Ok(Self(PlacementPolicy::Secondary)),
|
||||
_ if s.starts_with("attached:") => {
|
||||
let mut splitter = s.split(':');
|
||||
let _prefix = splitter.next().unwrap();
|
||||
match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
|
||||
Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
|
||||
None => Err(anyhow::anyhow!(
|
||||
"Invalid format '{s}', a valid example is 'attached:1'"
|
||||
)),
|
||||
}
|
||||
}
|
||||
_ => Err(anyhow::anyhow!(
|
||||
"Unknown placement policy '{s}', try detached,secondary,attached:<n>"
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
|
||||
|
||||
impl FromStr for ShardSchedulingPolicyArg {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self(ShardSchedulingPolicy::Active)),
|
||||
"essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
|
||||
"pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
|
||||
"stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
|
||||
_ => Err(anyhow::anyhow!(
|
||||
"Unknown scheduling policy '{s}', try active,essential,pause,stop"
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct NodeAvailabilityArg(NodeAvailabilityWrapper);
|
||||
|
||||
impl FromStr for NodeAvailabilityArg {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self(NodeAvailabilityWrapper::Active)),
|
||||
"offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Client {
|
||||
base_url: Url,
|
||||
jwt_token: Option<String>,
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
|
||||
Self {
|
||||
base_url,
|
||||
jwt_token,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into storage controller
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: hyper::Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> mgmt_api::Result<RS>
|
||||
where
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
self.base_url.host_str().unwrap(),
|
||||
self.base_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
builder = builder.json(&body)
|
||||
}
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
}
|
||||
|
||||
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
|
||||
let response = response.error_from_body().await?;
|
||||
|
||||
response
|
||||
.json()
|
||||
.await
|
||||
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
|
||||
|
||||
let mut trimmed = cli.api.to_string();
|
||||
trimmed.pop();
|
||||
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
|
||||
|
||||
match cli.command {
|
||||
Command::NodeRegister {
|
||||
node_id,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
} => {
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::POST,
|
||||
"control/v1/node".to_string(),
|
||||
Some(NodeRegisterRequest {
|
||||
node_id,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantCreate { tenant_id } => {
|
||||
vps_client
|
||||
.tenant_create(&TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
placement_policy: Some(PlacementPolicy::Attached(1)),
|
||||
config: TenantConfig::default(),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::TenantDelete { tenant_id } => {
|
||||
let status = vps_client
|
||||
.tenant_delete(TenantShardId::unsharded(tenant_id))
|
||||
.await?;
|
||||
tracing::info!("Delete status: {}", status);
|
||||
}
|
||||
Command::Nodes {} => {
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
||||
for node in resp {
|
||||
table.add_row([
|
||||
format!("{}", node.id),
|
||||
node.listen_http_addr,
|
||||
format!("{:?}", node.scheduling),
|
||||
format!("{:?}", node.availability),
|
||||
]);
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::NodeConfigure {
|
||||
node_id,
|
||||
availability,
|
||||
scheduling,
|
||||
} => {
|
||||
let req = NodeConfigureRequest {
|
||||
node_id,
|
||||
availability: availability.map(|a| a.0),
|
||||
scheduling,
|
||||
};
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{node_id}/config"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::Tenants {} => {
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/tenant".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"TenantId",
|
||||
"ShardCount",
|
||||
"StripeSize",
|
||||
"Placement",
|
||||
"Scheduling",
|
||||
]);
|
||||
for tenant in resp {
|
||||
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
||||
table.add_row([
|
||||
format!("{}", tenant.tenant_id),
|
||||
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
||||
format!("{:?}", tenant.stripe_size),
|
||||
format!("{:?}", tenant.policy),
|
||||
format!("{:?}", shard_zero.scheduling_policy),
|
||||
]);
|
||||
}
|
||||
|
||||
println!("{table}");
|
||||
}
|
||||
Command::TenantPolicy {
|
||||
tenant_id,
|
||||
placement,
|
||||
scheduling,
|
||||
} => {
|
||||
let req = TenantPolicyRequest {
|
||||
scheduling: scheduling.map(|s| s.0),
|
||||
placement: placement.map(|p| p.0),
|
||||
};
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_id}/policy"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantShardSplit {
|
||||
tenant_id,
|
||||
shard_count,
|
||||
stripe_size,
|
||||
} => {
|
||||
let req = TenantShardSplitRequest {
|
||||
new_shard_count: shard_count,
|
||||
new_stripe_size: stripe_size.map(ShardStripeSize),
|
||||
};
|
||||
|
||||
let response = storcon_client
|
||||
.dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
println!(
|
||||
"Split tenant {} into {} shards: {}",
|
||||
tenant_id,
|
||||
shard_count,
|
||||
response
|
||||
.new_shards
|
||||
.iter()
|
||||
.map(|s| format!("{:?}", s))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
}
|
||||
Command::TenantShardMigrate {
|
||||
tenant_shard_id,
|
||||
node,
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest {
|
||||
tenant_shard_id,
|
||||
node_id: node,
|
||||
};
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantConfig { tenant_id, config } => {
|
||||
let tenant_conf = serde_json::from_str(&config)?;
|
||||
|
||||
vps_client
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
config: tenant_conf,
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::TenantScatter { tenant_id } => {
|
||||
// Find the shards
|
||||
let locate_response = storcon_client
|
||||
.dispatch::<(), TenantLocateResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = locate_response.shards;
|
||||
|
||||
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
|
||||
let shard_count = shards.len();
|
||||
for s in shards {
|
||||
let entry = node_to_shards.entry(s.node_id).or_default();
|
||||
entry.push(s.shard_id);
|
||||
}
|
||||
|
||||
// Load list of available nodes
|
||||
let nodes_resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for node in nodes_resp {
|
||||
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
|
||||
node_to_shards.entry(node.id).or_default();
|
||||
}
|
||||
}
|
||||
|
||||
let max_shard_per_node = shard_count / node_to_shards.len();
|
||||
|
||||
loop {
|
||||
let mut migrate_shard = None;
|
||||
for shards in node_to_shards.values_mut() {
|
||||
if shards.len() > max_shard_per_node {
|
||||
// Pick the emptiest
|
||||
migrate_shard = Some(shards.pop().unwrap());
|
||||
}
|
||||
}
|
||||
let Some(migrate_shard) = migrate_shard else {
|
||||
break;
|
||||
};
|
||||
|
||||
// Pick the emptiest node to migrate to
|
||||
let mut destinations = node_to_shards
|
||||
.iter()
|
||||
.map(|(k, v)| (k, v.len()))
|
||||
.collect::<Vec<_>>();
|
||||
destinations.sort_by_key(|i| i.1);
|
||||
let (destination_node, destination_count) = *destinations.first().unwrap();
|
||||
if destination_count + 1 > max_shard_per_node {
|
||||
// Even the emptiest destination doesn't have space: we're done
|
||||
break;
|
||||
}
|
||||
let destination_node = *destination_node;
|
||||
|
||||
node_to_shards
|
||||
.get_mut(&destination_node)
|
||||
.unwrap()
|
||||
.push(migrate_shard);
|
||||
|
||||
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{migrate_shard}/migrate"),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id: migrate_shard,
|
||||
node_id: destination_node,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
|
||||
}
|
||||
|
||||
// Spread the shards across the nodes
|
||||
}
|
||||
Command::TenantDescribe { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = describe_response.shards;
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
|
||||
for shard in shards {
|
||||
let secondary = shard
|
||||
.node_secondary
|
||||
.iter()
|
||||
.map(|n| format!("{}", n))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
|
||||
let mut status_parts = Vec::new();
|
||||
if shard.is_reconciling {
|
||||
status_parts.push("reconciling");
|
||||
}
|
||||
|
||||
if shard.is_pending_compute_notification {
|
||||
status_parts.push("pending_compute");
|
||||
}
|
||||
|
||||
if shard.is_splitting {
|
||||
status_parts.push("splitting");
|
||||
}
|
||||
let status = status_parts.join(",");
|
||||
|
||||
table.add_row([
|
||||
format!("{}", shard.tenant_shard_id),
|
||||
shard
|
||||
.node_attached
|
||||
.map(|n| format!("{}", n))
|
||||
.unwrap_or(String::new()),
|
||||
secondary,
|
||||
shard.last_error,
|
||||
status,
|
||||
]);
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::TenantWarmup { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
match describe_response {
|
||||
Ok(describe) => {
|
||||
if matches!(describe.policy, PlacementPolicy::Secondary) {
|
||||
// Fine: it's already known to controller in secondary mode: calling
|
||||
// again to put it into secondary mode won't cause problems.
|
||||
} else {
|
||||
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
|
||||
}
|
||||
}
|
||||
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
|
||||
// Fine: this tenant isn't know to the storage controller yet.
|
||||
}
|
||||
Err(e) => {
|
||||
// Unexpected API error
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
vps_client
|
||||
.location_config(
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
pageserver_api::models::LocationConfig {
|
||||
mode: pageserver_api::models::LocationConfigMode::Secondary,
|
||||
generation: None,
|
||||
secondary_conf: Some(LocationConfigSecondary { warm: true }),
|
||||
shard_number: 0,
|
||||
shard_count: 0,
|
||||
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
},
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let secondary_ps_id = describe_response
|
||||
.shards
|
||||
.first()
|
||||
.unwrap()
|
||||
.node_secondary
|
||||
.first()
|
||||
.unwrap();
|
||||
|
||||
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
|
||||
loop {
|
||||
let (status, progress) = vps_client
|
||||
.tenant_secondary_download(
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
Some(Duration::from_secs(10)),
|
||||
)
|
||||
.await?;
|
||||
println!(
|
||||
"Progress: {}/{} layers, {}/{} bytes",
|
||||
progress.layers_downloaded,
|
||||
progress.layers_total,
|
||||
progress.bytes_downloaded,
|
||||
progress.bytes_total
|
||||
);
|
||||
match status {
|
||||
StatusCode::OK => {
|
||||
println!("Download complete");
|
||||
break;
|
||||
}
|
||||
StatusCode::ACCEPTED => {
|
||||
// Loop
|
||||
}
|
||||
_ => {
|
||||
anyhow::bail!("Unexpected download status: {status}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -2,8 +2,8 @@
|
||||
# see https://diesel.rs/guides/configuring-diesel-cli
|
||||
|
||||
[print_schema]
|
||||
file = "storage_controller/src/schema.rs"
|
||||
file = "control_plane/attachment_service/src/schema.rs"
|
||||
custom_type_derives = ["diesel::query_builder::QueryId"]
|
||||
|
||||
[migrations_directory]
|
||||
dir = "storage_controller/migrations"
|
||||
dir = "control_plane/attachment_service/migrations"
|
||||
|
||||
@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
|
||||
Neon storage broker, providing messaging between safekeepers and pageservers.
|
||||
[storage_broker.md](./storage_broker.md)
|
||||
|
||||
`storage_controller`:
|
||||
|
||||
Neon storage controller, manages a cluster of pageservers and exposes an API that enables
|
||||
managing a many-sharded tenant as a single entity.
|
||||
|
||||
`/control_plane`:
|
||||
|
||||
Local control plane.
|
||||
|
||||
@@ -1,150 +0,0 @@
|
||||
# Storage Controller
|
||||
|
||||
## Concepts
|
||||
|
||||
The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
|
||||
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
|
||||
|
||||
It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
|
||||
the underlying details of how data is spread across multiple nodes.
|
||||
|
||||
The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
|
||||
|
||||
## APIs
|
||||
|
||||
The storage controller’s HTTP server implements four logically separate APIs:
|
||||
|
||||
- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver.
|
||||
- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and management pageservers, or executing shard splits.
|
||||
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
|
||||
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
|
||||
to ensure data safety with generation numbers.
|
||||
|
||||
The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs).
|
||||
|
||||
See the `http.rs` file in the source for where the HTTP APIs are implemented.
|
||||
|
||||
## Database
|
||||
|
||||
The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
|
||||
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
|
||||
rebuilt on startup.
|
||||
|
||||
The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
|
||||
|
||||
The `diesel` crate is used for defining models & migrations.
|
||||
|
||||
Running a local cluster with `cargo neon` automatically starts a vanilla postgress process to host the storage controller’s database.
|
||||
|
||||
### Diesel tip: migrations
|
||||
|
||||
If you need to modify the database schema, here’s how to create a migration:
|
||||
|
||||
- Install the diesel CLI with `cargo install diesel_cli`
|
||||
- Use `diesel migration generate <name>` to create a new migration
|
||||
- Populate the SQL files in the `migrations/` subdirectory
|
||||
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically.
|
||||
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
|
||||
- Commit the migration files and the changes to schema.rs
|
||||
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
|
||||
- The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
|
||||
|
||||
## storcon_cli
|
||||
|
||||
The `storcon_cli` tool enables interactive management of the storage controller. This is usually
|
||||
only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline).
|
||||
|
||||
`storcon_cli --help` includes details on commands.
|
||||
|
||||
# Deploying
|
||||
|
||||
This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
|
||||
part of a self-hosted system.
|
||||
|
||||
_General note: since the default `neon_local` environment includes a storage controller, this is a useful
|
||||
reference when figuring out deployment._
|
||||
|
||||
## Database
|
||||
|
||||
It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
|
||||
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
|
||||
|
||||
The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
|
||||
|
||||
Set the URL to the database using the `--database-url` CLI option.
|
||||
|
||||
There is no need to run migrations manually: the storage controller automatically applies migrations
|
||||
when it starts up.
|
||||
|
||||
## Configure pageservers to use the storage controller
|
||||
|
||||
1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
|
||||
point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
|
||||
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
|
||||
with the storage controller when it starts up. See the example below for the format of this file.
|
||||
|
||||
### Example `metadata.json`
|
||||
|
||||
```
|
||||
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
|
||||
```
|
||||
|
||||
- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
|
||||
postgres runs.
|
||||
- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where
|
||||
the storage controller runs.
|
||||
|
||||
## Handle compute notifications.
|
||||
|
||||
The storage controller independently moves tenant attachments between pageservers in response to
|
||||
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
|
||||
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
|
||||
location changes.
|
||||
|
||||
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
|
||||
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
|
||||
|
||||
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
|
||||
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
|
||||
the compute hook.
|
||||
|
||||
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
|
||||
the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
|
||||
|
||||
```
|
||||
struct ComputeHookNotifyRequestShard {
|
||||
node_id: NodeId,
|
||||
shard_number: ShardNumber,
|
||||
}
|
||||
|
||||
struct ComputeHookNotifyRequest {
|
||||
tenant_id: TenantId,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
shards: Vec<ComputeHookNotifyRequestShard>,
|
||||
}
|
||||
```
|
||||
|
||||
When a notification is received:
|
||||
|
||||
1. Modify postgres configuration for this tenant:
|
||||
|
||||
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
|
||||
shards identified by `NodeId` must be converted to the address+port of the node.
|
||||
- if stripe_size is not None, set `neon.stripe_size` to this value
|
||||
|
||||
2. Send SIGHUP to postgres to reload configuration
|
||||
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
|
||||
will retry the notification until it succeeds..
|
||||
|
||||
### Example notification body
|
||||
|
||||
```
|
||||
{
|
||||
"tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
|
||||
"stripe_size": 32768,
|
||||
"shards": [
|
||||
{"node_id": 344, "shard_number": 0},
|
||||
{"node_id": 722, "shard_number": 1},
|
||||
],
|
||||
}
|
||||
```
|
||||
@@ -10,13 +10,11 @@ libc.workspace = true
|
||||
once_cell.workspace = true
|
||||
chrono.workspace = true
|
||||
twox-hash.workspace = true
|
||||
measured.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
procfs.workspace = true
|
||||
measured-process.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8"
|
||||
|
||||
@@ -7,19 +7,14 @@
|
||||
//! use significantly less memory than this, but can only approximate the cardinality.
|
||||
|
||||
use std::{
|
||||
hash::{BuildHasher, BuildHasherDefault, Hash},
|
||||
sync::atomic::AtomicU8,
|
||||
collections::HashMap,
|
||||
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
|
||||
sync::{atomic::AtomicU8, Arc, RwLock},
|
||||
};
|
||||
|
||||
use measured::{
|
||||
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
||||
metric::{
|
||||
group::{Encoding, MetricValue},
|
||||
name::MetricNameEncoder,
|
||||
Metric, MetricType, MetricVec,
|
||||
},
|
||||
text::TextEncoder,
|
||||
LabelGroup,
|
||||
use prometheus::{
|
||||
core::{self, Describer},
|
||||
proto, Opts,
|
||||
};
|
||||
use twox_hash::xxh3;
|
||||
|
||||
@@ -98,25 +93,203 @@ macro_rules! register_hll {
|
||||
/// ```
|
||||
///
|
||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
|
||||
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
|
||||
|
||||
pub struct HyperLogLogState<const N: usize> {
|
||||
shards: [AtomicU8; N],
|
||||
#[derive(Clone)]
|
||||
pub struct HyperLogLogVec<const N: usize> {
|
||||
core: Arc<HyperLogLogVecCore<N>>,
|
||||
}
|
||||
impl<const N: usize> Default for HyperLogLogState<N> {
|
||||
fn default() -> Self {
|
||||
#[allow(clippy::declare_interior_mutable_const)]
|
||||
const ZERO: AtomicU8 = AtomicU8::new(0);
|
||||
Self { shards: [ZERO; N] }
|
||||
|
||||
struct HyperLogLogVecCore<const N: usize> {
|
||||
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
|
||||
pub desc: core::Desc,
|
||||
pub opts: Opts,
|
||||
}
|
||||
|
||||
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
|
||||
fn desc(&self) -> Vec<&core::Desc> {
|
||||
vec![&self.core.desc]
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||
let mut m = proto::MetricFamily::default();
|
||||
m.set_name(self.core.desc.fq_name.clone());
|
||||
m.set_help(self.core.desc.help.clone());
|
||||
m.set_field_type(proto::MetricType::GAUGE);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
for child in self.core.children.read().unwrap().values() {
|
||||
child.core.collect_into(&mut metrics);
|
||||
}
|
||||
m.set_metric(metrics);
|
||||
|
||||
vec![m]
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> MetricType for HyperLogLogState<N> {
|
||||
type Metadata = ();
|
||||
impl<const N: usize> HyperLogLogVec<N> {
|
||||
/// Create a new [`HyperLogLogVec`] based on the provided
|
||||
/// [`Opts`] and partitioned by the given label names. At least one label name must be
|
||||
/// provided.
|
||||
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
|
||||
assert!(N.is_power_of_two());
|
||||
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
|
||||
let opts = opts.variable_labels(variable_names);
|
||||
|
||||
let desc = opts.describe()?;
|
||||
let v = HyperLogLogVecCore {
|
||||
children: RwLock::new(HashMap::default()),
|
||||
desc,
|
||||
opts,
|
||||
};
|
||||
|
||||
Ok(Self { core: Arc::new(v) })
|
||||
}
|
||||
|
||||
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
|
||||
/// of label values (same order as the VariableLabels in Desc). If that combination of
|
||||
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
|
||||
///
|
||||
/// An error is returned if the number of label values is not the same as the
|
||||
/// number of VariableLabels in Desc.
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
self.core.get_metric_with_label_values(vals)
|
||||
}
|
||||
|
||||
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
|
||||
/// occurs.
|
||||
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
|
||||
self.get_metric_with_label_values(vals).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogState<N> {
|
||||
impl<const N: usize> HyperLogLogVecCore<N> {
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
let h = self.hash_label_values(vals)?;
|
||||
|
||||
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
|
||||
return Ok(metric);
|
||||
}
|
||||
|
||||
self.get_or_create_metric(h, vals)
|
||||
}
|
||||
|
||||
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
|
||||
if vals.len() != self.desc.variable_labels.len() {
|
||||
return Err(prometheus::Error::InconsistentCardinality {
|
||||
expect: self.desc.variable_labels.len(),
|
||||
got: vals.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let mut h = xxh3::Hash64::default();
|
||||
for val in vals {
|
||||
h.write(val.as_bytes());
|
||||
}
|
||||
|
||||
Ok(h.finish())
|
||||
}
|
||||
|
||||
fn get_or_create_metric(
|
||||
&self,
|
||||
hash: u64,
|
||||
label_values: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
let mut children = self.children.write().unwrap();
|
||||
// Check exist first.
|
||||
if let Some(metric) = children.get(&hash).cloned() {
|
||||
return Ok(metric);
|
||||
}
|
||||
|
||||
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
|
||||
children.insert(hash, metric.clone());
|
||||
Ok(metric)
|
||||
}
|
||||
}
|
||||
|
||||
/// HLL is a probabilistic cardinality measure.
|
||||
///
|
||||
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// If you want an estimate over time, you can use the following query:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (
|
||||
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
||||
/// ) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
||||
///
|
||||
/// ```promql
|
||||
/// # LinearCounting(m, V) = m log (m / V)
|
||||
/// shards_count * ln(shards_count /
|
||||
/// # calculate V = how many shards contain a 0
|
||||
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
||||
/// )
|
||||
/// ```
|
||||
///
|
||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||
#[derive(Clone)]
|
||||
pub struct HyperLogLog<const N: usize> {
|
||||
core: Arc<HyperLogLogCore<N>>,
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLog<N> {
|
||||
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
|
||||
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
|
||||
assert!(N.is_power_of_two());
|
||||
let opts = Opts::new(name, help);
|
||||
Self::with_opts(opts)
|
||||
}
|
||||
|
||||
/// Create a [`HyperLogLog`] with the `opts` options.
|
||||
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
|
||||
Self::with_opts_and_label_values(&opts, &[])
|
||||
}
|
||||
|
||||
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
|
||||
let desc = opts.describe()?;
|
||||
let labels = make_label_pairs(&desc, label_values)?;
|
||||
|
||||
let v = HyperLogLogCore {
|
||||
shards: [0; N].map(AtomicU8::new),
|
||||
desc,
|
||||
labels,
|
||||
};
|
||||
Ok(Self { core: Arc::new(v) })
|
||||
}
|
||||
|
||||
pub fn measure(&self, item: &impl Hash) {
|
||||
// changing the hasher will break compatibility with previous measurements.
|
||||
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
||||
@@ -126,11 +299,42 @@ impl<const N: usize> HyperLogLogState<N> {
|
||||
let p = N.ilog2() as u8;
|
||||
let j = hash & (N as u64 - 1);
|
||||
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
||||
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
||||
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
struct HyperLogLogCore<const N: usize> {
|
||||
shards: [AtomicU8; N],
|
||||
desc: core::Desc,
|
||||
labels: Vec<proto::LabelPair>,
|
||||
}
|
||||
|
||||
impl<const N: usize> core::Collector for HyperLogLog<N> {
|
||||
fn desc(&self) -> Vec<&core::Desc> {
|
||||
vec![&self.core.desc]
|
||||
}
|
||||
|
||||
fn take_sample(&self) -> [u8; N] {
|
||||
self.shards.each_ref().map(|x| {
|
||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||
let mut m = proto::MetricFamily::default();
|
||||
m.set_name(self.core.desc.fq_name.clone());
|
||||
m.set_help(self.core.desc.help.clone());
|
||||
m.set_field_type(proto::MetricType::GAUGE);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
self.core.collect_into(&mut metrics);
|
||||
m.set_metric(metrics);
|
||||
|
||||
vec![m]
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogCore<N> {
|
||||
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
|
||||
self.shards.iter().enumerate().for_each(|(i, x)| {
|
||||
let mut shard_label = proto::LabelPair::default();
|
||||
shard_label.set_name("hll_shard".to_owned());
|
||||
shard_label.set_value(format!("{i}"));
|
||||
|
||||
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
||||
|
||||
// This seems like it would be a race condition,
|
||||
@@ -140,90 +344,85 @@ impl<const N: usize> HyperLogLogState<N> {
|
||||
|
||||
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
||||
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
||||
x.swap(0, std::sync::atomic::Ordering::Relaxed)
|
||||
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
let mut m = proto::Metric::default();
|
||||
let mut c = proto::Gauge::default();
|
||||
c.set_value(v as f64);
|
||||
m.set_gauge(c);
|
||||
|
||||
let mut labels = Vec::with_capacity(self.labels.len() + 1);
|
||||
labels.extend_from_slice(&self.labels);
|
||||
labels.push(shard_label);
|
||||
|
||||
m.set_label(labels);
|
||||
metrics.push(m);
|
||||
})
|
||||
}
|
||||
}
|
||||
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
||||
for HyperLogLogState<N>
|
||||
{
|
||||
fn write_type(
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut TextEncoder<W>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
enc.write_type(&name, measured::text::MetricType::Gauge)
|
||||
|
||||
fn make_label_pairs(
|
||||
desc: &core::Desc,
|
||||
label_values: &[&str],
|
||||
) -> prometheus::Result<Vec<proto::LabelPair>> {
|
||||
if desc.variable_labels.len() != label_values.len() {
|
||||
return Err(prometheus::Error::InconsistentCardinality {
|
||||
expect: desc.variable_labels.len(),
|
||||
got: label_values.len(),
|
||||
});
|
||||
}
|
||||
fn collect_into(
|
||||
&self,
|
||||
_: &(),
|
||||
labels: impl LabelGroup,
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut TextEncoder<W>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
struct I64(i64);
|
||||
impl LabelValue for I64 {
|
||||
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
|
||||
v.write_int(self.0)
|
||||
}
|
||||
}
|
||||
|
||||
struct HllShardLabel {
|
||||
hll_shard: i64,
|
||||
}
|
||||
|
||||
impl LabelGroup for HllShardLabel {
|
||||
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
|
||||
const LE: &LabelName = LabelName::from_str("hll_shard");
|
||||
v.write_value(LE, &I64(self.hll_shard));
|
||||
}
|
||||
}
|
||||
|
||||
self.take_sample()
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.try_for_each(|(hll_shard, val)| {
|
||||
enc.write_metric_value(
|
||||
name.by_ref(),
|
||||
labels.by_ref().compose_with(HllShardLabel {
|
||||
hll_shard: hll_shard as i64,
|
||||
}),
|
||||
MetricValue::Int(val as i64),
|
||||
)
|
||||
})
|
||||
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
|
||||
if total_len == 0 {
|
||||
return Ok(vec![]);
|
||||
}
|
||||
|
||||
if desc.variable_labels.is_empty() {
|
||||
return Ok(desc.const_label_pairs.clone());
|
||||
}
|
||||
|
||||
let mut label_pairs = Vec::with_capacity(total_len);
|
||||
for (i, n) in desc.variable_labels.iter().enumerate() {
|
||||
let mut label_pair = proto::LabelPair::default();
|
||||
label_pair.set_name(n.clone());
|
||||
label_pair.set_value(label_values[i].to_owned());
|
||||
label_pairs.push(label_pair);
|
||||
}
|
||||
|
||||
for label_pair in &desc.const_label_pairs {
|
||||
label_pairs.push(label_pair.clone());
|
||||
}
|
||||
label_pairs.sort();
|
||||
Ok(label_pairs)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashSet;
|
||||
|
||||
use measured::{label::StaticLabelSet, FixedCardinalityLabel};
|
||||
use prometheus::{proto, Opts};
|
||||
use rand::{rngs::StdRng, Rng, SeedableRng};
|
||||
use rand_distr::{Distribution, Zipf};
|
||||
|
||||
use crate::HyperLogLogVec;
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy)]
|
||||
#[label(singleton = "x")]
|
||||
enum Label {
|
||||
A,
|
||||
B,
|
||||
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
|
||||
let mut metrics = vec![];
|
||||
hll.core
|
||||
.children
|
||||
.read()
|
||||
.unwrap()
|
||||
.values()
|
||||
.for_each(|c| c.core.collect_into(&mut metrics));
|
||||
metrics
|
||||
}
|
||||
|
||||
fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
|
||||
// cannot go through the `hll.collect_family_into` interface yet...
|
||||
// need to see if I can fix the conflicting impls problem in measured.
|
||||
(
|
||||
hll.get_metric(hll.with_labels(Label::A)).take_sample(),
|
||||
hll.get_metric(hll.with_labels(Label::B)).take_sample(),
|
||||
)
|
||||
}
|
||||
|
||||
fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
|
||||
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
|
||||
let mut buckets = [0.0; 32];
|
||||
for &sample in samples {
|
||||
for (i, m) in sample.into_iter().enumerate() {
|
||||
buckets[i] = f64::max(buckets[i], m as f64);
|
||||
for metric in metrics.chunks_exact(32) {
|
||||
if filter(&metric[0]) {
|
||||
for (i, m) in metric.iter().enumerate() {
|
||||
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -238,7 +437,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
|
||||
let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();
|
||||
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
|
||||
|
||||
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
|
||||
let mut set_a = HashSet::new();
|
||||
@@ -246,20 +445,18 @@ mod tests {
|
||||
|
||||
for x in iter.by_ref().take(n) {
|
||||
set_a.insert(x.to_bits());
|
||||
hll.get_metric(hll.with_labels(Label::A))
|
||||
.measure(&x.to_bits());
|
||||
hll.with_label_values(&["a"]).measure(&x.to_bits());
|
||||
}
|
||||
for x in iter.by_ref().take(n) {
|
||||
set_b.insert(x.to_bits());
|
||||
hll.get_metric(hll.with_labels(Label::B))
|
||||
.measure(&x.to_bits());
|
||||
hll.with_label_values(&["b"]).measure(&x.to_bits());
|
||||
}
|
||||
let merge = &set_a | &set_b;
|
||||
|
||||
let (a, b) = collect(&hll);
|
||||
let len = get_cardinality(&[a, b]);
|
||||
let len_a = get_cardinality(&[a]);
|
||||
let len_b = get_cardinality(&[b]);
|
||||
let metrics = collect(&hll);
|
||||
let len = get_cardinality(&metrics, |_| true);
|
||||
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
|
||||
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
|
||||
|
||||
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
|
||||
}
|
||||
|
||||
@@ -4,17 +4,6 @@
|
||||
//! a default registry.
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
use measured::{
|
||||
label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
|
||||
metric::{
|
||||
counter::CounterState,
|
||||
gauge::GaugeState,
|
||||
group::{Encoding, MetricValue},
|
||||
name::{MetricName, MetricNameEncoder},
|
||||
MetricEncoding, MetricFamilyEncoding,
|
||||
},
|
||||
FixedCardinalityLabel, LabelGroup, MetricGroup,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use prometheus::core::{
|
||||
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
|
||||
@@ -22,7 +11,6 @@ use prometheus::core::{
|
||||
pub use prometheus::opts;
|
||||
pub use prometheus::register;
|
||||
pub use prometheus::Error;
|
||||
use prometheus::Registry;
|
||||
pub use prometheus::{core, default_registry, proto};
|
||||
pub use prometheus::{exponential_buckets, linear_buckets};
|
||||
pub use prometheus::{register_counter_vec, Counter, CounterVec};
|
||||
@@ -35,12 +23,13 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
|
||||
pub use prometheus::{register_int_gauge, IntGauge};
|
||||
pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
|
||||
pub use prometheus::{Encoder, TextEncoder};
|
||||
use prometheus::{Registry, Result};
|
||||
|
||||
pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
|
||||
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod more_process_metrics;
|
||||
|
||||
@@ -70,7 +59,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
|
||||
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
|
||||
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
|
||||
/// while holding the lock.
|
||||
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
|
||||
pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
|
||||
INTERNAL_REGISTRY.register(c)
|
||||
}
|
||||
|
||||
@@ -107,127 +96,6 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||
];
|
||||
|
||||
pub struct BuildInfo {
|
||||
pub revision: &'static str,
|
||||
pub build_tag: &'static str,
|
||||
}
|
||||
|
||||
// todo: allow label group without the set
|
||||
impl LabelGroup for BuildInfo {
|
||||
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
|
||||
const REVISION: &LabelName = LabelName::from_str("revision");
|
||||
v.write_value(REVISION, &self.revision);
|
||||
const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
|
||||
v.write_value(BUILD_TAG, &self.build_tag);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
|
||||
where
|
||||
GaugeState: MetricEncoding<T>,
|
||||
{
|
||||
fn collect_family_into(
|
||||
&self,
|
||||
name: impl measured::metric::name::MetricNameEncoder,
|
||||
enc: &mut T,
|
||||
) -> Result<(), T::Err> {
|
||||
enc.write_help(&name, "Build/version information")?;
|
||||
GaugeState::write_type(&name, enc)?;
|
||||
GaugeState {
|
||||
count: std::sync::atomic::AtomicI64::new(1),
|
||||
}
|
||||
.collect_into(&(), self, name, enc)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(build_info: BuildInfo))]
|
||||
pub struct NeonMetrics {
|
||||
#[cfg(target_os = "linux")]
|
||||
#[metric(namespace = "process")]
|
||||
#[metric(init = measured_process::ProcessCollector::for_self())]
|
||||
process: measured_process::ProcessCollector,
|
||||
|
||||
#[metric(namespace = "libmetrics")]
|
||||
#[metric(init = LibMetrics::new(build_info))]
|
||||
libmetrics: LibMetrics,
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
#[metric(new(build_info: BuildInfo))]
|
||||
pub struct LibMetrics {
|
||||
#[metric(init = build_info)]
|
||||
build_info: BuildInfo,
|
||||
|
||||
#[metric(flatten)]
|
||||
rusage: Rusage,
|
||||
|
||||
serve_count: CollectionCounter,
|
||||
}
|
||||
|
||||
fn write_gauge<Enc: Encoding>(
|
||||
x: i64,
|
||||
labels: impl LabelGroup,
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut Enc,
|
||||
) -> Result<(), Enc::Err> {
|
||||
enc.write_metric_value(name, labels, MetricValue::Int(x))
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct Rusage;
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy)]
|
||||
#[label(singleton = "io_operation")]
|
||||
enum IoOp {
|
||||
Read,
|
||||
Write,
|
||||
}
|
||||
|
||||
impl<T: Encoding> MetricGroup<T> for Rusage
|
||||
where
|
||||
GaugeState: MetricEncoding<T>,
|
||||
{
|
||||
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
|
||||
const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
|
||||
const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
|
||||
|
||||
let ru = get_rusage_stats();
|
||||
|
||||
enc.write_help(
|
||||
DISK_IO,
|
||||
"Bytes written and read from disk, grouped by the operation (read|write)",
|
||||
)?;
|
||||
GaugeState::write_type(DISK_IO, enc)?;
|
||||
write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
|
||||
write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
|
||||
|
||||
enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
|
||||
GaugeState::write_type(MAXRSS, enc)?;
|
||||
write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct CollectionCounter(CounterState);
|
||||
|
||||
impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
|
||||
where
|
||||
CounterState: MetricEncoding<T>,
|
||||
{
|
||||
fn collect_family_into(
|
||||
&self,
|
||||
name: impl measured::metric::name::MetricNameEncoder,
|
||||
enc: &mut T,
|
||||
) -> Result<(), T::Err> {
|
||||
self.0.inc();
|
||||
enc.write_help(&name, "Number of metric requests made")?;
|
||||
self.0.collect_into(&(), NoLabels, name, enc)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_build_info_metric(revision: &str, build_tag: &str) {
|
||||
let metric = register_int_gauge_vec!(
|
||||
"libmetrics_build_info",
|
||||
@@ -237,7 +105,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
|
||||
.expect("Failed to register build info metric");
|
||||
metric.with_label_values(&[revision, build_tag]).set(1);
|
||||
}
|
||||
const BYTES_IN_BLOCK: i64 = 512;
|
||||
|
||||
// Records I/O stats in a "cross-platform" way.
|
||||
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
|
||||
@@ -250,6 +117,7 @@ const BYTES_IN_BLOCK: i64 = 512;
|
||||
fn update_rusage_metrics() {
|
||||
let rusage_stats = get_rusage_stats();
|
||||
|
||||
const BYTES_IN_BLOCK: i64 = 512;
|
||||
DISK_IO_BYTES
|
||||
.with_label_values(&["read"])
|
||||
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
|
||||
@@ -283,7 +151,6 @@ macro_rules! register_int_counter_pair_vec {
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
/// Create an [`IntCounterPair`] and registers to default registry.
|
||||
#[macro_export(local_inner_macros)]
|
||||
macro_rules! register_int_counter_pair {
|
||||
@@ -321,10 +188,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
|
||||
///
|
||||
/// An error is returned if the number of label values is not the same as the
|
||||
/// number of VariableLabels in Desc.
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<GenericCounterPair<P>> {
|
||||
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
|
||||
Ok(GenericCounterPair {
|
||||
inc: self.inc.get_metric_with_label_values(vals)?,
|
||||
dec: self.dec.get_metric_with_label_values(vals)?,
|
||||
@@ -337,7 +201,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
|
||||
self.get_metric_with_label_values(vals).unwrap()
|
||||
}
|
||||
|
||||
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
|
||||
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
|
||||
res[0] = self.inc.remove_label_values(vals);
|
||||
res[1] = self.dec.remove_label_values(vals);
|
||||
}
|
||||
@@ -421,171 +285,3 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;
|
||||
|
||||
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
|
||||
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
|
||||
|
||||
pub trait CounterPairAssoc {
|
||||
const INC_NAME: &'static MetricName;
|
||||
const DEC_NAME: &'static MetricName;
|
||||
|
||||
const INC_HELP: &'static str;
|
||||
const DEC_HELP: &'static str;
|
||||
|
||||
type LabelGroupSet: LabelGroupSet;
|
||||
}
|
||||
|
||||
pub struct CounterPairVec<A: CounterPairAssoc> {
|
||||
vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
|
||||
}
|
||||
|
||||
impl<A: CounterPairAssoc> Default for CounterPairVec<A>
|
||||
where
|
||||
A::LabelGroupSet: Default,
|
||||
{
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
vec: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<A: CounterPairAssoc> CounterPairVec<A> {
|
||||
pub fn guard(
|
||||
&self,
|
||||
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
|
||||
) -> MeasuredCounterPairGuard<'_, A> {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.get_metric(id).inc.inc();
|
||||
MeasuredCounterPairGuard { vec: &self.vec, id }
|
||||
}
|
||||
pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.get_metric(id).inc.inc();
|
||||
}
|
||||
pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.get_metric(id).dec.inc();
|
||||
}
|
||||
pub fn remove_metric(
|
||||
&self,
|
||||
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
|
||||
) -> Option<MeasuredCounterPairState> {
|
||||
let id = self.vec.with_labels(labels);
|
||||
self.vec.remove_metric(id)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
|
||||
where
|
||||
T: ::measured::metric::group::Encoding,
|
||||
A: CounterPairAssoc,
|
||||
::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
|
||||
{
|
||||
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
|
||||
// write decrement first to avoid a race condition where inc - dec < 0
|
||||
T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
|
||||
self.vec
|
||||
.collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
|
||||
|
||||
T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
|
||||
self.vec
|
||||
.collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(MetricGroup, Default)]
|
||||
pub struct MeasuredCounterPairState {
|
||||
pub inc: CounterState,
|
||||
pub dec: CounterState,
|
||||
}
|
||||
|
||||
impl measured::metric::MetricType for MeasuredCounterPairState {
|
||||
type Metadata = ();
|
||||
}
|
||||
|
||||
pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
|
||||
vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
|
||||
id: measured::metric::LabelId<A::LabelGroupSet>,
|
||||
}
|
||||
|
||||
impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
|
||||
fn drop(&mut self) {
|
||||
self.vec.get_metric(self.id).dec.inc();
|
||||
}
|
||||
}
|
||||
|
||||
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
|
||||
struct Inc<T>(T);
|
||||
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
|
||||
struct Dec<T>(T);
|
||||
|
||||
impl<T: Encoding> Encoding for Inc<T> {
|
||||
type Err = T::Err;
|
||||
|
||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||
self.0.write_help(name, help)
|
||||
}
|
||||
|
||||
fn write_metric_value(
|
||||
&mut self,
|
||||
name: impl MetricNameEncoder,
|
||||
labels: impl LabelGroup,
|
||||
value: MetricValue,
|
||||
) -> Result<(), Self::Err> {
|
||||
self.0.write_metric_value(name, labels, value)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
|
||||
where
|
||||
CounterState: MetricEncoding<T>,
|
||||
{
|
||||
fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
|
||||
CounterState::write_type(name, &mut enc.0)
|
||||
}
|
||||
fn collect_into(
|
||||
&self,
|
||||
metadata: &(),
|
||||
labels: impl LabelGroup,
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut Inc<T>,
|
||||
) -> Result<(), T::Err> {
|
||||
self.inc.collect_into(metadata, labels, name, &mut enc.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Encoding> Encoding for Dec<T> {
|
||||
type Err = T::Err;
|
||||
|
||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
||||
self.0.write_help(name, help)
|
||||
}
|
||||
|
||||
fn write_metric_value(
|
||||
&mut self,
|
||||
name: impl MetricNameEncoder,
|
||||
labels: impl LabelGroup,
|
||||
value: MetricValue,
|
||||
) -> Result<(), Self::Err> {
|
||||
self.0.write_metric_value(name, labels, value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Write the dec counter to the encoder
|
||||
impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
|
||||
where
|
||||
CounterState: MetricEncoding<T>,
|
||||
{
|
||||
fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
|
||||
CounterState::write_type(name, &mut enc.0)
|
||||
}
|
||||
fn collect_into(
|
||||
&self,
|
||||
metadata: &(),
|
||||
labels: impl LabelGroup,
|
||||
name: impl MetricNameEncoder,
|
||||
enc: &mut Dec<T>,
|
||||
) -> Result<(), T::Err> {
|
||||
self.dec.collect_into(metadata, labels, name, &mut enc.0)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,9 +2,9 @@ use std::str::FromStr;
|
||||
|
||||
/// Request/response types for the storage controller
|
||||
/// API (`/control/v1` prefix). Implemented by the server
|
||||
/// in [`storage_controller::http`]
|
||||
/// in [`attachment_service::http`]
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{
|
||||
models::{ShardParameters, TenantConfig},
|
||||
@@ -42,12 +42,6 @@ pub struct NodeConfigureRequest {
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantPolicyRequest {
|
||||
pub placement: Option<PlacementPolicy>,
|
||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantLocateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
@@ -68,27 +62,12 @@ pub struct TenantLocateResponse {
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantDescribeResponse {
|
||||
pub tenant_id: TenantId,
|
||||
pub shards: Vec<TenantDescribeResponseShard>,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
pub policy: PlacementPolicy,
|
||||
pub config: TenantConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeDescribeResponse {
|
||||
pub id: NodeId,
|
||||
|
||||
pub availability: NodeAvailabilityWrapper,
|
||||
pub scheduling: NodeSchedulingPolicy,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantDescribeResponseShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
@@ -104,8 +83,6 @@ pub struct TenantDescribeResponseShard {
|
||||
pub is_pending_compute_notification: bool,
|
||||
/// A shard split is currently underway
|
||||
pub is_splitting: bool,
|
||||
|
||||
pub scheduling_policy: ShardSchedulingPolicy,
|
||||
}
|
||||
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
@@ -120,7 +97,7 @@ pub struct TenantShardMigrateRequest {
|
||||
/// Utilisation score indicating how good a candidate a pageserver
|
||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||
/// Lower values are better.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub struct UtilizationScore(pub u64);
|
||||
|
||||
impl UtilizationScore {
|
||||
@@ -129,7 +106,7 @@ impl UtilizationScore {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
|
||||
#[derive(Serialize, Clone, Copy)]
|
||||
#[serde(into = "NodeAvailabilityWrapper")]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
@@ -152,7 +129,7 @@ impl Eq for NodeAvailability {}
|
||||
// This wrapper provides serde functionality and it should only be used to
|
||||
// communicate with external callers which don't know or care about the
|
||||
// utilisation score of the pageserver it is targeting.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub enum NodeAvailabilityWrapper {
|
||||
Active,
|
||||
Offline,
|
||||
@@ -178,33 +155,22 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum ShardSchedulingPolicy {
|
||||
// Normal mode: the tenant's scheduled locations may be updated at will, including
|
||||
// for non-essential optimization.
|
||||
Active,
|
||||
impl FromStr for NodeAvailability {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
|
||||
// For example, this still permits a node's attachment location to change to a secondary in
|
||||
// response to a node failure, or to assign a new secondary if a node was removed.
|
||||
Essential,
|
||||
|
||||
// No scheduling: leave the shard running wherever it currently is. Even if the shard is
|
||||
// unavailable, it will not be rescheduled to another node.
|
||||
Pause,
|
||||
|
||||
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
|
||||
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
|
||||
Stop,
|
||||
}
|
||||
|
||||
impl Default for ShardSchedulingPolicy {
|
||||
fn default() -> Self {
|
||||
Self::Active
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
// This is used when parsing node configuration requests from neon-local.
|
||||
// Assume the worst possible utilisation score
|
||||
// and let it get updated via the heartbeats.
|
||||
"active" => Ok(Self::Active(UtilizationScore::worst())),
|
||||
"offline" => Ok(Self::Offline),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
Filling,
|
||||
|
||||
@@ -20,7 +20,6 @@ use utils::{
|
||||
history_buffer::HistoryBufferWithDropCounter,
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
serde_system_time,
|
||||
};
|
||||
|
||||
use crate::controller_api::PlacementPolicy;
|
||||
@@ -302,7 +301,6 @@ pub struct TenantConfig {
|
||||
pub heatmap_period: Option<String>,
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -747,18 +745,10 @@ pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalRedoManagerProcessStatus {
|
||||
pub pid: u32,
|
||||
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
|
||||
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
|
||||
pub kind: Cow<'static, str>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalRedoManagerStatus {
|
||||
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
pub process: Option<WalRedoManagerProcessStatus>,
|
||||
pub pid: Option<u32>,
|
||||
}
|
||||
|
||||
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
||||
@@ -767,7 +757,11 @@ pub struct WalRedoManagerStatus {
|
||||
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct SecondaryProgress {
|
||||
/// The remote storage LastModified time of the heatmap object we last downloaded.
|
||||
pub heatmap_mtime: Option<serde_system_time::SystemTime>,
|
||||
#[serde(
|
||||
serialize_with = "opt_ser_rfc3339_millis",
|
||||
deserialize_with = "opt_deser_rfc3339_millis"
|
||||
)]
|
||||
pub heatmap_mtime: Option<SystemTime>,
|
||||
|
||||
/// The number of layers currently on-disk
|
||||
pub layers_downloaded: usize,
|
||||
@@ -780,15 +774,27 @@ pub struct SecondaryProgress {
|
||||
pub bytes_total: u64,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantScanRemoteStorageShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub generation: Option<u32>,
|
||||
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
|
||||
ts: &Option<SystemTime>,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
match ts {
|
||||
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
|
||||
None => serializer.serialize_none(),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
pub struct TenantScanRemoteStorageResponse {
|
||||
pub shards: Vec<TenantScanRemoteStorageShard>,
|
||||
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
match s {
|
||||
None => Ok(None),
|
||||
Some(s) => humantime::parse_rfc3339(&s)
|
||||
.map_err(serde::de::Error::custom)
|
||||
.map(Some),
|
||||
}
|
||||
}
|
||||
|
||||
pub mod virtual_file {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use utils::serde_system_time::SystemTime;
|
||||
use std::time::SystemTime;
|
||||
|
||||
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
|
||||
/// the next tenant.
|
||||
@@ -21,9 +21,28 @@ pub struct PageserverUtilization {
|
||||
/// When was this snapshot captured, pageserver local time.
|
||||
///
|
||||
/// Use millis to give confidence that the value is regenerated often enough.
|
||||
#[serde(
|
||||
serialize_with = "ser_rfc3339_millis",
|
||||
deserialize_with = "deser_rfc3339_millis"
|
||||
)]
|
||||
pub captured_at: SystemTime,
|
||||
}
|
||||
|
||||
fn ser_rfc3339_millis<S: serde::Serializer>(
|
||||
ts: &SystemTime,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
||||
}
|
||||
|
||||
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
|
||||
}
|
||||
|
||||
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
||||
///
|
||||
/// Instead of newtype, use this because a newtype would get require handling deserializing values
|
||||
@@ -50,9 +69,7 @@ mod tests {
|
||||
disk_usage_bytes: u64::MAX,
|
||||
free_space_bytes: 0,
|
||||
utilization_score: u64::MAX,
|
||||
captured_at: SystemTime(
|
||||
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
|
||||
),
|
||||
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
|
||||
};
|
||||
|
||||
let s = serde_json::to_string(&doc).unwrap();
|
||||
|
||||
@@ -8,89 +8,12 @@ use hex::FromHex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TenantId;
|
||||
|
||||
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
|
||||
///
|
||||
/// This module contains a variety of types used to represent the concept of sharding
|
||||
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
|
||||
/// we provide an summary here.
|
||||
///
|
||||
/// Types used to describe shards:
|
||||
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
|
||||
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
|
||||
/// a shard suffix.
|
||||
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
|
||||
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
|
||||
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
|
||||
/// tenant, such as layer files.
|
||||
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
|
||||
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
|
||||
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
|
||||
/// four hex digits. An unsharded tenant is `0000`.
|
||||
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
|
||||
///
|
||||
/// Types used to describe the parameters for data distribution in a sharded tenant:
|
||||
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
|
||||
/// multiple shards. Its value is given in 8kiB pages.
|
||||
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
|
||||
/// always zero: this is provided for future upgrades that might introduce different
|
||||
/// data distribution schemes.
|
||||
///
|
||||
/// Examples:
|
||||
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
|
||||
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
|
||||
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||
/// and their slugs are 0004, 0104, 0204, and 0304.
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardNumber(pub u8);
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(u8);
|
||||
|
||||
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
||||
/// when we need to know which shard we're dealing with, but do not need to know the full
|
||||
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
||||
/// the fully qualified TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
|
||||
/// and to check whether that [`ShardNumber`] is the same as the current shard.
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardIdentity {
|
||||
pub number: ShardNumber,
|
||||
pub count: ShardCount,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
/// Formatting helper, for generating the `shard_id` label in traces.
|
||||
struct ShardSlug<'a>(&'a TenantShardId);
|
||||
|
||||
/// TenantShardId globally identifies a particular shard in a particular tenant.
|
||||
///
|
||||
/// These are written as `<TenantId>-<ShardSlug>`, for example:
|
||||
/// # The second shard in a two-shard tenant
|
||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||
///
|
||||
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
|
||||
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
|
||||
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
|
||||
///
|
||||
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
|
||||
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
|
||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||
/// as a TenantId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TenantShardId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
|
||||
@@ -115,7 +38,6 @@ impl ShardCount {
|
||||
self.0
|
||||
}
|
||||
|
||||
///
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
@@ -131,6 +53,33 @@ impl ShardNumber {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
}
|
||||
|
||||
/// TenantShardId identify the units of work for the Pageserver.
|
||||
///
|
||||
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
|
||||
///
|
||||
/// # The second shard in a two-shard tenant
|
||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||
///
|
||||
/// Historically, tenants could not have multiple shards, and were identified
|
||||
/// by TenantId. To support this, TenantShardId has a special legacy
|
||||
/// mode where `shard_count` is equal to zero: this represents a single-sharded
|
||||
/// tenant which should be written as a TenantId with no suffix.
|
||||
///
|
||||
/// The human-readable encoding of TenantShardId, such as used in API URLs,
|
||||
/// is both forward and backward compatible: a legacy TenantId can be
|
||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||
/// as a TenantId.
|
||||
///
|
||||
/// Note that the binary encoding is _not_ backward compatible, because
|
||||
/// at the time sharding is introduced, there are no existing binary structures
|
||||
/// containing TenantId that we need to handle.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TenantShardId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl TenantShardId {
|
||||
pub fn unsharded(tenant_id: TenantId) -> Self {
|
||||
Self {
|
||||
@@ -162,13 +111,10 @@ impl TenantShardId {
|
||||
}
|
||||
|
||||
/// Convenience for code that has special behavior on the 0th shard.
|
||||
pub fn is_shard_zero(&self) -> bool {
|
||||
pub fn is_zero(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0)
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
||||
}
|
||||
@@ -204,6 +150,9 @@ impl TenantShardId {
|
||||
}
|
||||
}
|
||||
|
||||
/// Formatting helper
|
||||
struct ShardSlug<'a>(&'a TenantShardId);
|
||||
|
||||
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
@@ -273,6 +222,16 @@ impl From<[u8; 18]> for TenantShardId {
|
||||
}
|
||||
}
|
||||
|
||||
/// For use within the context of a particular tenant, when we need to know which
|
||||
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
|
||||
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
|
||||
/// TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl ShardIndex {
|
||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
@@ -287,9 +246,6 @@ impl ShardIndex {
|
||||
}
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
@@ -357,8 +313,6 @@ impl Serialize for TenantShardId {
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
// Note: while human encoding of [`TenantShardId`] is backward and forward
|
||||
// compatible, this binary encoding is not.
|
||||
let mut packed: [u8; 18] = [0; 18];
|
||||
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
|
||||
packed[16] = self.shard_number.0;
|
||||
@@ -436,6 +390,16 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
|
||||
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
|
||||
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
|
||||
/// The ShardIdentity contains the information needed for one member of map
|
||||
/// to resolve a key to a shard, and then check whether that shard is ==self.
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardIdentity {
|
||||
pub number: ShardNumber,
|
||||
pub count: ShardCount,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
||||
pub enum ShardConfigError {
|
||||
#[error("Invalid shard count")]
|
||||
@@ -475,9 +439,6 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.number == ShardNumber(0) && self.count == ShardCount(0)
|
||||
}
|
||||
@@ -526,8 +487,6 @@ impl ShardIdentity {
|
||||
}
|
||||
|
||||
/// Return true if the key should be ingested by this shard
|
||||
///
|
||||
/// Shards must ingest _at least_ keys which return true from this check.
|
||||
pub fn is_key_local(&self, key: &Key) -> bool {
|
||||
assert!(!self.is_broken());
|
||||
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
|
||||
@@ -538,9 +497,7 @@ impl ShardIdentity {
|
||||
}
|
||||
|
||||
/// Return true if the key should be discarded if found in this shard's
|
||||
/// data store, e.g. during compaction after a split.
|
||||
///
|
||||
/// Shards _may_ drop keys which return false here, but are not obliged to.
|
||||
/// data store, e.g. during compaction after a split
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
@@ -566,7 +523,7 @@ impl ShardIdentity {
|
||||
|
||||
/// Convenience for checking if this identity is the 0th shard in a tenant,
|
||||
/// for special cases on shard 0 such as ingesting relation sizes.
|
||||
pub fn is_shard_zero(&self) -> bool {
|
||||
pub fn is_zero(&self) -> bool {
|
||||
self.number == ShardNumber(0)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -134,11 +134,6 @@ impl RemotePath {
|
||||
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
|
||||
self.0.strip_prefix(&p.0)
|
||||
}
|
||||
|
||||
pub fn add_trailing_slash(&self) -> Self {
|
||||
// Unwrap safety inputs are guararnteed to be valid UTF-8
|
||||
Self(format!("{}/", self.0).try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
/// We don't need callers to be able to pass arbitrary delimiters: just control
|
||||
@@ -162,21 +157,47 @@ pub struct Listing {
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[allow(async_fn_in_trait)]
|
||||
pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
|
||||
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
|
||||
///
|
||||
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
|
||||
/// from the absolute root of the bucket.
|
||||
///
|
||||
/// `mode` configures whether to use a delimiter. Without a delimiter all keys
|
||||
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
|
||||
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
|
||||
/// returned in `keys` ().
|
||||
///
|
||||
/// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
|
||||
/// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
|
||||
/// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
|
||||
/// so this method doesnt need to.
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::WithDelimiter, None, cancel)
|
||||
.await?
|
||||
.prefixes;
|
||||
Ok(result)
|
||||
}
|
||||
/// Lists all files in directory "recursively"
|
||||
/// (not really recursively, because AWS has a flat namespace)
|
||||
/// Note: This is subtely different than list_prefixes,
|
||||
/// because it is for listing files instead of listing
|
||||
/// names sharing common prefixes.
|
||||
/// For example,
|
||||
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
|
||||
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
|
||||
/// whereas,
|
||||
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
||||
/// See `test_real_s3.rs` for more details.
|
||||
///
|
||||
/// max_keys limits max number of keys returned; None means unlimited.
|
||||
async fn list_files(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
|
||||
.await?
|
||||
.keys;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -315,6 +336,41 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
|
||||
// A function for listing all the files in a "directory"
|
||||
// Example:
|
||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||
//
|
||||
// max_keys limits max number of keys returned; None means unlimited.
|
||||
pub async fn list_files(
|
||||
&self,
|
||||
folder: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
// lists common *prefixes*, if any of files
|
||||
// Example:
|
||||
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
|
||||
pub async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::upload`]
|
||||
pub async fn upload(
|
||||
&self,
|
||||
@@ -509,16 +565,6 @@ impl GenericRemoteStorage {
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct StorageMetadata(HashMap<String, String>);
|
||||
|
||||
impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
|
||||
fn from(arr: [(&str, &str); N]) -> Self {
|
||||
let map: HashMap<String, String> = arr
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_string(), v.to_string()))
|
||||
.collect();
|
||||
Self(map)
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
|
||||
@@ -5,9 +5,11 @@
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
io::ErrorKind,
|
||||
num::NonZeroU32,
|
||||
pin::Pin,
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
};
|
||||
|
||||
@@ -20,11 +22,11 @@ use tokio::{
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
@@ -91,47 +93,7 @@ impl LocalFs {
|
||||
|
||||
#[cfg(test)]
|
||||
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
use std::{future::Future, pin::Pin};
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Utf8Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path =
|
||||
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
|
||||
anyhow::Error::msg(format!(
|
||||
"non-Unicode path: {}",
|
||||
pb.to_string_lossy()
|
||||
))
|
||||
})?;
|
||||
if file_type.is_symlink() {
|
||||
tracing::debug!("{entry_path:?} is a symlink, skipping")
|
||||
} else if file_type.is_dir() {
|
||||
paths.extend(get_all_files(&entry_path).await?.into_iter())
|
||||
} else {
|
||||
paths.push(entry_path);
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path {directory_path:?} is not a directory")
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Ok(get_all_files(&self.storage_root)
|
||||
Ok(get_all_files(&self.storage_root, true)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|path| {
|
||||
@@ -153,20 +115,11 @@ impl LocalFs {
|
||||
None => self.storage_root.clone(),
|
||||
};
|
||||
|
||||
eprintln!("local_fs list: searching from {full_path} for initial_dir");
|
||||
// If we were given a directory, we may use it as our starting point.
|
||||
// Otherwise, we must go up to the first ancestor dir that exists. This is because
|
||||
// S3 object list prefixes can be arbitrary strings, but when reading
|
||||
// the local filesystem we need a directory to start calling read_dir on.
|
||||
let mut initial_dir = full_path.clone();
|
||||
|
||||
// If there's no trailing slash, we have to start looking from one above: even if
|
||||
// `initial_dir` is a directory, we should still list any prefixes in the parent
|
||||
// that start with the same string.
|
||||
if !full_path.to_string().ends_with('/') {
|
||||
initial_dir.pop();
|
||||
}
|
||||
|
||||
loop {
|
||||
// Did we make it to the root?
|
||||
if initial_dir.parent().is_none() {
|
||||
@@ -197,8 +150,6 @@ impl LocalFs {
|
||||
// starts_with later.
|
||||
let prefix = full_path.as_str();
|
||||
|
||||
eprintln!("local_fs list: initial_dir={initial_dir}");
|
||||
|
||||
let mut files = vec![];
|
||||
let mut directory_queue = vec![initial_dir];
|
||||
while let Some(cur_folder) = directory_queue.pop() {
|
||||
@@ -212,8 +163,6 @@ impl LocalFs {
|
||||
if full_file_name.is_dir() {
|
||||
directory_queue.push(full_file_name);
|
||||
}
|
||||
} else {
|
||||
eprintln!("Drop {full_file_name}, not in prefix");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -343,76 +292,64 @@ impl RemoteStorage for LocalFs {
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
if let Some(prefix) = prefix {
|
||||
eprintln!("local_fs list: prefix={}", prefix);
|
||||
}
|
||||
let op = async {
|
||||
let mut result = Listing::default();
|
||||
|
||||
// Filter out directories: in S3 directories don't exist, only the keys within them do.
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
result.keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let Some(max_keys) = max_keys {
|
||||
result.keys.truncate(max_keys.get() as usize);
|
||||
}
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
let keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
result.keys = keys;
|
||||
} else {
|
||||
let mut prefixes = HashSet::new();
|
||||
for key in keys {
|
||||
eprintln!("key: {key}");
|
||||
// If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
|
||||
let relative_key = if let Some(prefix) = prefix {
|
||||
let mut prefix = prefix.clone();
|
||||
// We only strip the dirname of the prefix, so that when we strip it from the start of keys we
|
||||
// end up with full file/dir names.
|
||||
let prefix_full_local_path = prefix.with_base(&self.storage_root);
|
||||
let has_slash = prefix.0.to_string().ends_with('/');
|
||||
let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
|
||||
prefix
|
||||
} else {
|
||||
prefix.0.pop();
|
||||
prefix
|
||||
};
|
||||
eprintln!("strip_prefix={strip_prefix}");
|
||||
|
||||
RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
|
||||
} else {
|
||||
key
|
||||
};
|
||||
|
||||
eprintln!("relative_key: {relative_key}");
|
||||
|
||||
let relative_key = format!("{}", relative_key);
|
||||
if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
let first_part = relative_key
|
||||
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.next()
|
||||
.unwrap()
|
||||
.to_owned();
|
||||
prefixes.insert(first_part);
|
||||
} else {
|
||||
result
|
||||
.keys
|
||||
.push(RemotePath::from_string(&relative_key).unwrap());
|
||||
}
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let stripped = prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
);
|
||||
|
||||
if prefix.is_dir() {
|
||||
result.prefixes.push(stripped);
|
||||
} else {
|
||||
result.keys.push(stripped);
|
||||
}
|
||||
result.prefixes = prefixes
|
||||
.into_iter()
|
||||
.map(|s| RemotePath::from_string(&s).unwrap())
|
||||
.collect();
|
||||
}
|
||||
|
||||
if let Some(max_keys) = max_keys {
|
||||
result.keys.truncate(max_keys.get() as usize);
|
||||
}
|
||||
Ok(result)
|
||||
};
|
||||
|
||||
@@ -623,6 +560,50 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
path_with_suffix_extension(original_path, "metadata")
|
||||
}
|
||||
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
recursive: bool,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Utf8Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path =
|
||||
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
|
||||
anyhow::Error::msg(format!(
|
||||
"non-Unicode path: {}",
|
||||
pb.to_string_lossy()
|
||||
))
|
||||
})?;
|
||||
if file_type.is_symlink() {
|
||||
debug!("{entry_path:?} is a symlink, skipping")
|
||||
} else if file_type.is_dir() {
|
||||
if recursive {
|
||||
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
|
||||
} else {
|
||||
paths.push(entry_path)
|
||||
}
|
||||
} else {
|
||||
paths.push(entry_path);
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path {directory_path:?} is not a directory")
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
let target_dir = match target_file_path.parent() {
|
||||
Some(parent_dir) => parent_dir,
|
||||
@@ -942,18 +923,13 @@ mod fs_tests {
|
||||
// No delimiter: should recursively list everything
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
|
||||
let child_sibling =
|
||||
upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
|
||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
|
||||
|
||||
let listing = storage
|
||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||
.await?;
|
||||
assert!(listing.prefixes.is_empty());
|
||||
assert_eq!(
|
||||
listing.keys,
|
||||
[uncle.clone(), child.clone(), child_sibling.clone()].to_vec()
|
||||
);
|
||||
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
||||
|
||||
// Delimiter: should only go one deep
|
||||
let listing = storage
|
||||
@@ -966,25 +942,7 @@ mod fs_tests {
|
||||
);
|
||||
assert!(listing.keys.is_empty());
|
||||
|
||||
// Delimiter & prefix with a trailing slash
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(
|
||||
listing.keys,
|
||||
[RemotePath::from_string("uncle").unwrap()].to_vec()
|
||||
);
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("parent").unwrap()].to_vec()
|
||||
);
|
||||
|
||||
// Delimiter and prefix without a trailing slash
|
||||
// Delimiter & prefix
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
||||
@@ -993,66 +951,12 @@ mod fs_tests {
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(listing.keys, [].to_vec());
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("grandparent").unwrap()].to_vec()
|
||||
);
|
||||
|
||||
// Delimiter and prefix that's partway through a path component
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(listing.keys, [].to_vec());
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("grandparent").unwrap()].to_vec()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn list_part_component() -> anyhow::Result<()> {
|
||||
// No delimiter: should recursively list everything
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
// Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
|
||||
// of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
|
||||
// a freeform prefix.
|
||||
let _child_a =
|
||||
upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
|
||||
let _child_b =
|
||||
upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
|
||||
|
||||
// Delimiter and prefix that's partway through a path component
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(
|
||||
&RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
|
||||
),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(listing.keys, [].to_vec());
|
||||
|
||||
let mut found_prefixes = listing.prefixes.clone();
|
||||
found_prefixes.sort();
|
||||
assert_eq!(
|
||||
found_prefixes,
|
||||
[
|
||||
RemotePath::from_string("tenant").unwrap(),
|
||||
RemotePath::from_string("tenant-01").unwrap(),
|
||||
]
|
||||
.to_vec()
|
||||
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
|
||||
.to_vec()
|
||||
);
|
||||
assert_eq!(listing.keys, [uncle.clone()].to_vec());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -178,7 +178,10 @@ impl S3Bucket {
|
||||
|
||||
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
let path_string = path.get_path().as_str();
|
||||
let path_string = path
|
||||
.get_path()
|
||||
.as_str()
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
match &self.prefix_in_bucket {
|
||||
Some(prefix) => prefix.clone() + "/" + path_string,
|
||||
None => path_string.to_string(),
|
||||
@@ -468,7 +471,17 @@ impl RemoteStorage for S3Bucket {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone().map(|s| s + "/"));
|
||||
.or_else(|| self.prefix_in_bucket.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
@@ -536,15 +549,11 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
// S3 gives us prefixes like "foo/", we return them like "foo"
|
||||
result.prefixes.extend(prefixes.iter().filter_map(|o| {
|
||||
Some(
|
||||
self.s3_object_to_relative_path(
|
||||
o.prefix()?
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
|
||||
),
|
||||
)
|
||||
}));
|
||||
result.prefixes.extend(
|
||||
prefixes
|
||||
.iter()
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
|
||||
continuation_token = match response.next_continuation_token {
|
||||
Some(new_token) => Some(new_token),
|
||||
@@ -1041,22 +1050,22 @@ mod tests {
|
||||
Some("/test/prefix/"),
|
||||
];
|
||||
let expected_outputs = [
|
||||
vec!["", "some/path", "some/path/"],
|
||||
vec!["/", "/some/path", "/some/path/"],
|
||||
vec!["", "some/path", "some/path"],
|
||||
vec!["/", "/some/path", "/some/path"],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path/",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path/",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path/",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
];
|
||||
|
||||
|
||||
@@ -107,6 +107,27 @@ impl UnreliableWrapper {
|
||||
type VoidStorage = crate::LocalFs;
|
||||
|
||||
impl RemoteStorage for UnreliableWrapper {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list_prefixes(prefix, cancel).await
|
||||
}
|
||||
|
||||
async fn list_files(
|
||||
&self,
|
||||
folder: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list_files(folder, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use remote_storage::ListingMode;
|
||||
use remote_storage::RemotePath;
|
||||
use std::sync::Arc;
|
||||
use std::{collections::HashSet, num::NonZeroU32};
|
||||
@@ -55,9 +54,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||
.context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
||||
.await?
|
||||
.prefixes
|
||||
.list_prefixes(None, &cancel)
|
||||
.await
|
||||
.context("client list root prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
@@ -66,14 +65,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
);
|
||||
|
||||
let nested_remote_prefixes = test_client
|
||||
.list(
|
||||
Some(&base_prefix.add_trailing_slash()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?
|
||||
.prefixes
|
||||
.list_prefixes(Some(&base_prefix), &cancel)
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let remote_only_prefixes = nested_remote_prefixes
|
||||
@@ -96,13 +90,11 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
///
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn list_no_delimiter_works(
|
||||
ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
|
||||
) -> anyhow::Result<()> {
|
||||
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
|
||||
@@ -115,36 +107,29 @@ async fn list_no_delimiter_works(
|
||||
let base_prefix =
|
||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||
.list_files(None, None, &cancel)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.keys
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_files,
|
||||
ctx.remote_blobs.clone(),
|
||||
"remote storage list on root mismatches with the uploads."
|
||||
"remote storage list_files on root mismatches with the uploads."
|
||||
);
|
||||
|
||||
// Test that max_keys limit works. In total there are about 21 files (see
|
||||
// upload_simple_remote_data call in test_real_s3.rs).
|
||||
let limited_root_files = test_client
|
||||
.list(
|
||||
None,
|
||||
ListingMode::NoDelimiter,
|
||||
Some(NonZeroU32::new(2).unwrap()),
|
||||
&cancel,
|
||||
)
|
||||
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
|
||||
.await
|
||||
.context("client list root files failure")?;
|
||||
assert_eq!(limited_root_files.keys.len(), 2);
|
||||
assert_eq!(limited_root_files.len(), 2);
|
||||
|
||||
let nested_remote_files = test_client
|
||||
.list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
|
||||
.list_files(Some(&base_prefix), None, &cancel)
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.keys
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let trim_remote_blobs: HashSet<_> = ctx
|
||||
@@ -156,7 +141,7 @@ async fn list_no_delimiter_works(
|
||||
.collect();
|
||||
assert_eq!(
|
||||
nested_remote_files, trim_remote_blobs,
|
||||
"remote storage list on subdirrectory mismatches with the uploads."
|
||||
"remote storage list_files on subdirrectory mismatches with the uploads."
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -214,11 +199,7 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
|
||||
|
||||
let prefixes = ctx
|
||||
.client
|
||||
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
||||
.await?
|
||||
.prefixes;
|
||||
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
|
||||
|
||||
assert_eq!(prefixes.len(), 1);
|
||||
|
||||
|
||||
@@ -57,6 +57,7 @@ enum MaybeEnabledStorage {
|
||||
Disabled,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorage {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
@@ -85,6 +86,7 @@ struct AzureWithTestBlobs {
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
@@ -132,6 +134,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
Enabled(AzureWithSimpleTestBlobs),
|
||||
Disabled,
|
||||
@@ -142,6 +148,7 @@ struct AzureWithSimpleTestBlobs {
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
|
||||
@@ -12,8 +12,8 @@ use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use futures_util::StreamExt;
|
||||
use remote_storage::{
|
||||
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
|
||||
RemoteStorageKind, S3Config,
|
||||
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
S3Config,
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
@@ -75,14 +75,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(
|
||||
retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.keys
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>(),
|
||||
)
|
||||
Ok(retry(|| client.list_files(None, None, cancel))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>())
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
@@ -222,6 +219,7 @@ enum MaybeEnabledStorage {
|
||||
Disabled,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorage {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
@@ -250,6 +248,7 @@ struct S3WithTestBlobs {
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
@@ -297,6 +296,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
Enabled(S3WithSimpleTestBlobs),
|
||||
Disabled,
|
||||
@@ -307,6 +310,7 @@ struct S3WithSimpleTestBlobs {
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
|
||||
@@ -22,7 +22,6 @@ camino.workspace = true
|
||||
chrono.workspace = true
|
||||
heapless.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
//! Wrapper around `std::env::var` for parsing environment variables.
|
||||
|
||||
use std::{fmt::Display, str::FromStr};
|
||||
|
||||
pub fn var<V, E>(varname: &str) -> Option<V>
|
||||
where
|
||||
V: FromStr<Err = E>,
|
||||
E: Display,
|
||||
{
|
||||
match std::env::var(varname) {
|
||||
Ok(s) => Some(
|
||||
s.parse()
|
||||
.map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
|
||||
.unwrap(),
|
||||
),
|
||||
Err(std::env::VarError::NotPresent) => None,
|
||||
Err(std::env::VarError::NotUnicode(_)) => {
|
||||
panic!("env var {varname} is not unicode")
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -34,8 +34,6 @@ pub enum Generation {
|
||||
/// scenarios where pageservers might otherwise issue conflicting writes to
|
||||
/// remote storage
|
||||
impl Generation {
|
||||
pub const MAX: Self = Self::Valid(u32::MAX);
|
||||
|
||||
/// Create a new Generation that represents a legacy key format with
|
||||
/// no generation suffix
|
||||
pub fn none() -> Self {
|
||||
|
||||
@@ -63,7 +63,6 @@ pub mod measured_stream;
|
||||
|
||||
pub mod serde_percent;
|
||||
pub mod serde_regex;
|
||||
pub mod serde_system_time;
|
||||
|
||||
pub mod pageserver_feedback;
|
||||
|
||||
@@ -90,10 +89,6 @@ pub mod yielding_loop;
|
||||
|
||||
pub mod zstd;
|
||||
|
||||
pub mod env;
|
||||
|
||||
pub mod poison;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
@@ -1,121 +0,0 @@
|
||||
//! Protect a piece of state from reuse after it is left in an inconsistent state.
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```
|
||||
//! # tokio_test::block_on(async {
|
||||
//! use utils::poison::Poison;
|
||||
//! use std::time::Duration;
|
||||
//!
|
||||
//! struct State {
|
||||
//! clean: bool,
|
||||
//! }
|
||||
//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
|
||||
//!
|
||||
//! let mut mutex_guard = state.lock().await;
|
||||
//! let mut poison_guard = mutex_guard.check_and_arm()?;
|
||||
//! let state = poison_guard.data_mut();
|
||||
//! state.clean = false;
|
||||
//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
|
||||
//! tokio::time::sleep(Duration::from_secs(10)).await;
|
||||
//! state.clean = true;
|
||||
//! poison_guard.disarm();
|
||||
//! # Ok::<(), utils::poison::Error>(())
|
||||
//! # });
|
||||
//! ```
|
||||
|
||||
use tracing::warn;
|
||||
|
||||
pub struct Poison<T> {
|
||||
what: &'static str,
|
||||
state: State,
|
||||
data: T,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
enum State {
|
||||
Clean,
|
||||
Armed,
|
||||
Poisoned { at: chrono::DateTime<chrono::Utc> },
|
||||
}
|
||||
|
||||
impl<T> Poison<T> {
|
||||
/// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
|
||||
pub fn new(what: &'static str, data: T) -> Self {
|
||||
Self {
|
||||
what,
|
||||
state: State::Clean,
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
|
||||
pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
|
||||
match self.state {
|
||||
State::Clean => {
|
||||
self.state = State::Armed;
|
||||
Ok(Guard(self))
|
||||
}
|
||||
State::Armed => unreachable!("transient state"),
|
||||
State::Poisoned { at } => Err(Error::Poisoned {
|
||||
what: self.what,
|
||||
at,
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
|
||||
/// Once modifications are done, use [`Self::disarm`].
|
||||
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
|
||||
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
|
||||
pub struct Guard<'a, T>(&'a mut Poison<T>);
|
||||
|
||||
impl<'a, T> Guard<'a, T> {
|
||||
pub fn data(&self) -> &T {
|
||||
&self.0.data
|
||||
}
|
||||
pub fn data_mut(&mut self) -> &mut T {
|
||||
&mut self.0.data
|
||||
}
|
||||
|
||||
pub fn disarm(self) {
|
||||
match self.0.state {
|
||||
State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
|
||||
State::Armed => {
|
||||
self.0.state = State::Clean;
|
||||
}
|
||||
State::Poisoned { at } => {
|
||||
unreachable!("we fail check_and_arm() if it's in that state: {at}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Drop for Guard<'a, T> {
|
||||
fn drop(&mut self) {
|
||||
match self.0.state {
|
||||
State::Clean => {
|
||||
// set by disarm()
|
||||
}
|
||||
State::Armed => {
|
||||
// still armed => poison it
|
||||
let at = chrono::Utc::now();
|
||||
self.0.state = State::Poisoned { at };
|
||||
warn!(at=?at, "poisoning {}", self.0.what);
|
||||
}
|
||||
State::Poisoned { at } => {
|
||||
unreachable!("we fail check_and_arm() if it's in that state: {at}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error("poisoned at {at}: {what}")]
|
||||
Poisoned {
|
||||
what: &'static str,
|
||||
at: chrono::DateTime<chrono::Utc>,
|
||||
},
|
||||
}
|
||||
@@ -182,18 +182,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
|
||||
pub fn would_wait_for(&self, num: V) -> Result<(), V> {
|
||||
let internal = self.internal.lock().unwrap();
|
||||
let cnt = internal.current.cnt_value();
|
||||
drop(internal);
|
||||
if cnt >= num {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(cnt)
|
||||
}
|
||||
}
|
||||
|
||||
/// Register and return a channel that will be notified when a number arrives,
|
||||
/// or None, if it has already arrived.
|
||||
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct SystemTime(
|
||||
#[serde(
|
||||
deserialize_with = "deser_rfc3339_millis",
|
||||
serialize_with = "ser_rfc3339_millis"
|
||||
)]
|
||||
pub std::time::SystemTime,
|
||||
);
|
||||
|
||||
fn ser_rfc3339_millis<S: serde::ser::Serializer>(
|
||||
ts: &std::time::SystemTime,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
||||
}
|
||||
|
||||
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
|
||||
fn to_millisecond_precision(time: SystemTime) -> SystemTime {
|
||||
match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
|
||||
Ok(duration) => {
|
||||
let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
|
||||
SystemTime(
|
||||
std::time::SystemTime::UNIX_EPOCH
|
||||
+ std::time::Duration::from_millis(total_millis),
|
||||
)
|
||||
}
|
||||
Err(_) => time,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_deserialize() {
|
||||
let input = SystemTime(std::time::SystemTime::now());
|
||||
let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
|
||||
let serialized = serde_json::to_string(&input).unwrap();
|
||||
assert_eq!(expected_serialized, serialized);
|
||||
let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
|
||||
assert_eq!(to_millisecond_precision(input), deserialized);
|
||||
}
|
||||
}
|
||||
@@ -192,14 +192,6 @@ impl<T> OnceCell<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
|
||||
/// initialized.
|
||||
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
|
||||
let inner = self.inner.get_mut().unwrap();
|
||||
|
||||
inner.take_and_deinit()
|
||||
}
|
||||
|
||||
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
|
||||
pub fn initializer_count(&self) -> usize {
|
||||
self.initializers.load(Ordering::Relaxed)
|
||||
@@ -254,23 +246,15 @@ impl<'a, T> Guard<'a, T> {
|
||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
|
||||
self.0
|
||||
.take_and_deinit()
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Inner<T> {
|
||||
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
|
||||
let value = self.value.take()?;
|
||||
|
||||
let mut swapped = Inner::default();
|
||||
let sem = swapped.init_semaphore.clone();
|
||||
// acquire and forget right away, moving the control over to InitPermit
|
||||
sem.try_acquire().expect("we just created this").forget();
|
||||
let permit = InitPermit(sem);
|
||||
std::mem::swap(self, &mut swapped);
|
||||
Some((value, permit))
|
||||
std::mem::swap(&mut *self.0, &mut swapped);
|
||||
swapped
|
||||
.value
|
||||
.map(|v| (v, InitPermit(sem)))
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -279,13 +263,6 @@ impl<T> Inner<T> {
|
||||
/// On drop, this type will return the permit.
|
||||
pub struct InitPermit(Arc<tokio::sync::Semaphore>);
|
||||
|
||||
impl std::fmt::Debug for InitPermit {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let ptr = Arc::as_ptr(&self.0) as *const ();
|
||||
f.debug_tuple("InitPermit").field(&ptr).finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for InitPermit {
|
||||
fn drop(&mut self) {
|
||||
assert_eq!(
|
||||
@@ -582,22 +559,4 @@ mod tests {
|
||||
|
||||
assert_eq!(*target.get().unwrap(), 11);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn take_and_deinit_on_mut() {
|
||||
use std::convert::Infallible;
|
||||
|
||||
let mut target = OnceCell::<u32>::default();
|
||||
assert!(target.take_and_deinit().is_none());
|
||||
|
||||
target
|
||||
.get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let again = target.take_and_deinit();
|
||||
assert!(matches!(again, Some((42, _))), "{again:?}");
|
||||
|
||||
assert!(target.take_and_deinit().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,50 +27,30 @@
|
||||
//!
|
||||
//! # Reference Numbers
|
||||
//!
|
||||
//! 2024-04-15 on i3en.3xlarge
|
||||
//! 2024-03-20 on i3en.3xlarge
|
||||
//!
|
||||
//! ```text
|
||||
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
||||
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
||||
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
||||
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
||||
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
||||
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
||||
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
||||
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
||||
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
||||
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
||||
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
||||
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
||||
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
||||
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
||||
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
||||
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
|
||||
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
|
||||
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
|
||||
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
|
||||
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
|
||||
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
|
||||
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
|
||||
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
|
||||
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
|
||||
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
|
||||
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
|
||||
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
|
||||
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
|
||||
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
|
||||
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
|
||||
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
|
||||
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
|
||||
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
|
||||
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
|
||||
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
|
||||
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
|
||||
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
|
||||
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
|
||||
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
|
||||
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
|
||||
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
|
||||
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
|
||||
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
|
||||
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
|
||||
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
|
||||
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
|
||||
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
|
||||
//! ```
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::{PostgresRedoManager, ProcessKind},
|
||||
};
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
sync::Arc,
|
||||
@@ -80,39 +60,33 @@ use tokio::{sync::Barrier, task::JoinSet};
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(format!("{process_kind}-short"));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| {
|
||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| {
|
||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -120,16 +94,10 @@ criterion::criterion_group!(benches, bench);
|
||||
criterion::criterion_main!(benches);
|
||||
|
||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||
fn bench_impl(
|
||||
process_kind: ProcessKind,
|
||||
redo_work: Arc<Request>,
|
||||
n_redos: u64,
|
||||
nclients: u64,
|
||||
) -> Duration {
|
||||
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||
|
||||
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
conf.walredo_process_kind = process_kind;
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
@@ -145,40 +113,25 @@ fn bench_impl(
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
// divide the amount of work equally among the clients.
|
||||
let nredos_per_client = n_redos / nclients;
|
||||
for _ in 0..nclients {
|
||||
rt.block_on(async {
|
||||
tasks.spawn(client(
|
||||
Arc::clone(&manager),
|
||||
Arc::clone(&start),
|
||||
Arc::clone(&redo_work),
|
||||
nredos_per_client,
|
||||
// divide the amount of work equally among the clients
|
||||
n_redos / nclients,
|
||||
))
|
||||
});
|
||||
}
|
||||
|
||||
let elapsed = rt.block_on(async move {
|
||||
let mut total_wallclock_time = Duration::ZERO;
|
||||
rt.block_on(async move {
|
||||
let mut total_wallclock_time = std::time::Duration::from_millis(0);
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
total_wallclock_time += res.unwrap();
|
||||
}
|
||||
total_wallclock_time
|
||||
});
|
||||
|
||||
// consistency check to ensure process kind setting worked
|
||||
if nredos_per_client > 0 {
|
||||
assert_eq!(
|
||||
manager
|
||||
.status()
|
||||
.process
|
||||
.map(|p| p.kind)
|
||||
.expect("the benchmark work causes a walredo process to be spawned"),
|
||||
std::borrow::Cow::Borrowed(process_kind.into())
|
||||
);
|
||||
}
|
||||
|
||||
elapsed
|
||||
})
|
||||
}
|
||||
|
||||
async fn client(
|
||||
|
||||
@@ -128,12 +128,12 @@ impl Client {
|
||||
|
||||
pub async fn timeline_info(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
force_await_logical_size: ForceAwaitLogicalSize,
|
||||
) -> Result<pageserver_api::models::TimelineInfo> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
|
||||
@@ -151,11 +151,11 @@ impl Client {
|
||||
|
||||
pub async fn keyspace(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<pageserver_api::models::partitioning::Partitioning> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
self.get(&uri)
|
||||
@@ -243,19 +243,6 @@ impl Client {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_scan_remote_storage(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<TenantScanRemoteStorageResponse> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_id}/scan_remote_storage",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
let response = self.request(Method::GET, &uri, ()).await?;
|
||||
let body = response.json().await.map_err(Error::ReceiveBody)?;
|
||||
Ok(body)
|
||||
}
|
||||
|
||||
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
|
||||
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
|
||||
self.request(Method::PUT, &uri, req).await?;
|
||||
|
||||
@@ -11,6 +11,7 @@ default = []
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
|
||||
@@ -43,8 +43,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
fanout: u64,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
|
||||
let exp_base = fanout.max(2);
|
||||
assert!(fanout >= 2);
|
||||
// Start at L0
|
||||
let mut current_level_no = 0;
|
||||
let mut current_level_target_height = target_file_size;
|
||||
@@ -107,7 +106,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
break;
|
||||
}
|
||||
current_level_no += 1;
|
||||
current_level_target_height = current_level_target_height.saturating_mul(exp_base);
|
||||
current_level_target_height = current_level_target_height.saturating_mul(fanout);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -180,7 +180,7 @@ where
|
||||
match top.deref_mut() {
|
||||
LazyLoadLayer::Unloaded(ref mut l) => {
|
||||
let fut = l.load_keys(this.ctx);
|
||||
this.load_future.set(Some(Box::pin(fut)));
|
||||
this.load_future.set(Some(fut));
|
||||
continue;
|
||||
}
|
||||
LazyLoadLayer::Loaded(ref mut entries) => {
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
//!
|
||||
//! All the heavy lifting is done by the create_image and create_delta
|
||||
//! functions that the implementor provides.
|
||||
use async_trait::async_trait;
|
||||
use futures::Future;
|
||||
use pageserver_api::{key::Key, keyspace::key_range_size};
|
||||
use std::ops::Range;
|
||||
@@ -140,16 +141,18 @@ pub trait CompactionLayer<K: CompactionKey + ?Sized> {
|
||||
|
||||
fn is_delta(&self) -> bool;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
|
||||
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
|
||||
where
|
||||
Self: 'a;
|
||||
|
||||
/// Return all keys in this delta layer.
|
||||
fn load_keys<'a>(
|
||||
async fn load_keys<'a>(
|
||||
&self,
|
||||
ctx: &E::RequestContext,
|
||||
) -> impl Future<Output = anyhow::Result<Vec<Self::DeltaEntry<'_>>>> + Send;
|
||||
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
|
||||
}
|
||||
|
||||
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
|
||||
|
||||
@@ -2,6 +2,7 @@ mod draw;
|
||||
|
||||
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use futures::StreamExt;
|
||||
use rand::Rng;
|
||||
use tracing::info;
|
||||
@@ -138,6 +139,7 @@ impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
|
||||
type DeltaEntry<'a> = MockRecord;
|
||||
|
||||
|
||||
@@ -12,14 +12,9 @@ bytes.workspace = true
|
||||
camino.workspace = true
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
pageserver = { path = ".." }
|
||||
pageserver_api.workspace = true
|
||||
remote_storage = { path = "../../libs/remote_storage" }
|
||||
postgres_ffi.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
utils.workspace = true
|
||||
svg_fmt.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -9,11 +9,6 @@ mod index_part;
|
||||
mod layer_map_analyzer;
|
||||
mod layers;
|
||||
|
||||
use std::{
|
||||
str::FromStr,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::{Parser, Subcommand};
|
||||
use index_part::IndexPartCmd;
|
||||
@@ -25,16 +20,8 @@ use pageserver::{
|
||||
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
|
||||
virtual_file,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_ffi::ControlFileData;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::TimelineId,
|
||||
logging::{self, LogFormat, TracingErrorLayerEnablement},
|
||||
lsn::Lsn,
|
||||
project_git_version,
|
||||
};
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
@@ -56,7 +43,6 @@ enum Commands {
|
||||
#[command(subcommand)]
|
||||
IndexPart(IndexPartCmd),
|
||||
PrintLayerFile(PrintLayerFileCmd),
|
||||
TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
|
||||
DrawTimeline {},
|
||||
AnalyzeLayerMap(AnalyzeLayerMapCmd),
|
||||
#[command(subcommand)]
|
||||
@@ -82,26 +68,6 @@ struct PrintLayerFileCmd {
|
||||
path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
/// Roll back the time for the specified prefix using S3 history.
|
||||
///
|
||||
/// The command is fairly low level and powerful. Validation is only very light,
|
||||
/// so it is more powerful, and thus potentially more dangerous.
|
||||
#[derive(Parser)]
|
||||
struct TimeTravelRemotePrefixCmd {
|
||||
/// A configuration string for the remote_storage configuration.
|
||||
///
|
||||
/// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
|
||||
config_toml_str: String,
|
||||
/// remote prefix to time travel recover. For safety reasons, we require it to contain
|
||||
/// a timeline or tenant ID in the prefix.
|
||||
prefix: String,
|
||||
/// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
|
||||
travel_to: String,
|
||||
/// Timestamp of the start of the operation, must be after any changes we want to roll back and after.
|
||||
/// You can use a few seconds before invoking the command. Same format as `travel_to`.
|
||||
done_if_after: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
struct AnalyzeLayerMapCmd {
|
||||
/// Pageserver data path
|
||||
@@ -112,14 +78,6 @@ struct AnalyzeLayerMapCmd {
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
LogFormat::Plain,
|
||||
TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
let cli = CliOpts::parse();
|
||||
|
||||
match cli.command {
|
||||
@@ -147,42 +105,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
print_layerfile(&cmd.path).await?;
|
||||
}
|
||||
}
|
||||
Commands::TimeTravelRemotePrefix(cmd) => {
|
||||
let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
|
||||
.map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
|
||||
|
||||
let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
|
||||
humantime::parse_rfc3339(done_if_after).map_err(|_e| {
|
||||
anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
|
||||
})?
|
||||
} else {
|
||||
const SAFETY_MARGIN: Duration = Duration::from_secs(3);
|
||||
tokio::time::sleep(SAFETY_MARGIN).await;
|
||||
// Convert to string representation and back to get rid of sub-second values
|
||||
let done_if_after = SystemTime::now();
|
||||
tokio::time::sleep(SAFETY_MARGIN).await;
|
||||
done_if_after
|
||||
};
|
||||
|
||||
let timestamp = strip_subsecond(timestamp);
|
||||
let done_if_after = strip_subsecond(done_if_after);
|
||||
|
||||
let Some(prefix) = validate_prefix(&cmd.prefix) else {
|
||||
println!("specified prefix '{}' failed validation", cmd.prefix);
|
||||
return Ok(());
|
||||
};
|
||||
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
|
||||
let toml_item = toml_document
|
||||
.get("remote_storage")
|
||||
.expect("need remote_storage");
|
||||
let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
|
||||
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
|
||||
let cancel = CancellationToken::new();
|
||||
storage
|
||||
.unwrap()
|
||||
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
|
||||
.await?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
@@ -263,89 +185,3 @@ fn handle_metadata(
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Ensures that the given S3 prefix is sufficiently constrained.
|
||||
/// The command is very risky already and we don't want to expose something
|
||||
/// that allows usually unintentional and quite catastrophic time travel of
|
||||
/// an entire bucket, which would be a major catastrophy and away
|
||||
/// by only one character change (similar to "rm -r /home /username/foobar").
|
||||
fn validate_prefix(prefix: &str) -> Option<RemotePath> {
|
||||
if prefix.is_empty() {
|
||||
// Empty prefix means we want to specify the *whole* bucket
|
||||
return None;
|
||||
}
|
||||
let components = prefix.split('/').collect::<Vec<_>>();
|
||||
let (last, components) = {
|
||||
let last = components.last()?;
|
||||
if last.is_empty() {
|
||||
(
|
||||
components.iter().nth_back(1)?,
|
||||
&components[..(components.len() - 1)],
|
||||
)
|
||||
} else {
|
||||
(last, &components[..])
|
||||
}
|
||||
};
|
||||
'valid: {
|
||||
if let Ok(_timeline_id) = TimelineId::from_str(last) {
|
||||
// Ends in either a tenant or timeline ID
|
||||
break 'valid;
|
||||
}
|
||||
if *last == "timelines" {
|
||||
if let Some(before_last) = components.iter().nth_back(1) {
|
||||
if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
|
||||
// Has a valid tenant id
|
||||
break 'valid;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return None;
|
||||
}
|
||||
RemotePath::from_string(prefix).ok()
|
||||
}
|
||||
|
||||
fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
|
||||
let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
|
||||
humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_validate_prefix() {
|
||||
assert_eq!(validate_prefix(""), None);
|
||||
assert_eq!(validate_prefix("/"), None);
|
||||
#[track_caller]
|
||||
fn assert_valid(prefix: &str) {
|
||||
let remote_path = RemotePath::from_string(prefix).unwrap();
|
||||
assert_eq!(validate_prefix(prefix), Some(remote_path));
|
||||
}
|
||||
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/");
|
||||
// Path is not relative but absolute
|
||||
assert_eq!(
|
||||
validate_prefix(
|
||||
"/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"
|
||||
),
|
||||
None
|
||||
);
|
||||
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/");
|
||||
// Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
|
||||
assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None);
|
||||
assert_eq!(validate_prefix("wal"), None);
|
||||
assert_eq!(validate_prefix("/wal/"), None);
|
||||
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001");
|
||||
// Partial tenant ID
|
||||
assert_eq!(
|
||||
validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"),
|
||||
None
|
||||
);
|
||||
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
|
||||
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
|
||||
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
|
||||
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
|
||||
assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use anyhow::Context;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
|
||||
use pageserver_client::page_service::BasebackupRequest;
|
||||
|
||||
@@ -96,7 +95,7 @@ async fn main_impl(
|
||||
let timeline = *timeline;
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(
|
||||
TenantShardId::unsharded(timeline.tenant_id),
|
||||
timeline.tenant_id,
|
||||
timeline.timeline_id,
|
||||
ForceAwaitLogicalSize::No,
|
||||
)
|
||||
|
||||
@@ -4,7 +4,6 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::PagestreamGetPageRequest;
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -174,10 +173,7 @@ async fn main_impl(
|
||||
let timeline = *timeline;
|
||||
async move {
|
||||
let partitioning = mgmt_api_client
|
||||
.keyspace(
|
||||
TenantShardId::unsharded(timeline.tenant_id),
|
||||
timeline.timeline_id,
|
||||
)
|
||||
.keyspace(timeline.tenant_id, timeline.timeline_id)
|
||||
.await?;
|
||||
let lsn = partitioning.at_lsn;
|
||||
let start = Instant::now();
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use humantime::Duration;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
@@ -60,11 +59,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
js.spawn(async move {
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(
|
||||
TenantShardId::unsharded(tl.tenant_id),
|
||||
tl.timeline_id,
|
||||
ForceAwaitLogicalSize::Yes,
|
||||
)
|
||||
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -79,11 +74,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
while !info.current_logical_size_is_accurate {
|
||||
ticker.tick().await;
|
||||
info = mgmt_api_client
|
||||
.timeline_info(
|
||||
TenantShardId::unsharded(tl.tenant_id),
|
||||
tl.timeline_id,
|
||||
ForceAwaitLogicalSize::Yes,
|
||||
)
|
||||
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
@@ -90,8 +90,15 @@ where
|
||||
(end_of_timeline.prev, end_of_timeline.last)
|
||||
};
|
||||
|
||||
// XXX hack for imports
|
||||
let prev_lsn = backup_lsn;
|
||||
// Consolidate the derived and the provided prev_lsn values
|
||||
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||
if backup_prev != Lsn(0) {
|
||||
ensure!(backup_prev == provided_prev_lsn);
|
||||
}
|
||||
provided_prev_lsn
|
||||
} else {
|
||||
backup_prev
|
||||
};
|
||||
|
||||
info!(
|
||||
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
|
||||
|
||||
@@ -18,7 +18,6 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use pageserver::tenant::{secondary, TenantSharedResources};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
|
||||
@@ -285,7 +284,6 @@ fn start_pageserver(
|
||||
))
|
||||
.unwrap();
|
||||
pageserver::preinitialize_metrics();
|
||||
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
@@ -673,37 +671,42 @@ fn start_pageserver(
|
||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
|
||||
{
|
||||
BACKGROUND_RUNTIME.block_on(async move {
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
|
||||
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
|
||||
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
|
||||
let signal = tokio::select! {
|
||||
_ = sigquit.recv() => {
|
||||
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
|
||||
std::process::exit(111);
|
||||
}
|
||||
_ = sigint.recv() => { "SIGINT" },
|
||||
_ = sigterm.recv() => { "SIGTERM" },
|
||||
};
|
||||
use signal_hook::consts::*;
|
||||
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
|
||||
let mut signals =
|
||||
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
|
||||
return signals
|
||||
.forever()
|
||||
.next()
|
||||
.expect("forever() never returns None unless explicitly closed");
|
||||
});
|
||||
let signal = BACKGROUND_RUNTIME
|
||||
.block_on(signal_handler)
|
||||
.expect("join error");
|
||||
match signal {
|
||||
SIGQUIT => {
|
||||
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
|
||||
std::process::exit(111);
|
||||
}
|
||||
SIGINT | SIGTERM => {
|
||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
|
||||
|
||||
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
|
||||
|
||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
pageserver::shutdown_pageserver(
|
||||
&tenant_manager,
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
unreachable!()
|
||||
})
|
||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
||||
&tenant_manager,
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
));
|
||||
unreachable!()
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -97,8 +97,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -142,8 +140,6 @@ pub mod defaults {
|
||||
|
||||
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
|
||||
|
||||
#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -294,8 +290,6 @@ pub struct PageServerConf {
|
||||
///
|
||||
/// Setting this to zero disables limits on total ephemeral layer size.
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
|
||||
pub walredo_process_kind: crate::walredo::ProcessKind,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -419,8 +413,6 @@ struct PageServerConfigBuilder {
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
|
||||
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -508,8 +500,6 @@ impl PageServerConfigBuilder {
|
||||
)),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
|
||||
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -693,10 +683,6 @@ impl PageServerConfigBuilder {
|
||||
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
|
||||
self.walredo_process_kind = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -753,7 +739,6 @@ impl PageServerConfigBuilder {
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
walredo_process_kind,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -1047,9 +1032,6 @@ impl PageServerConf {
|
||||
"ephemeral_bytes_per_memory_kb" => {
|
||||
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
|
||||
}
|
||||
"walredo_process_kind" => {
|
||||
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1132,7 +1114,6 @@ impl PageServerConf {
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1370,8 +1351,7 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1443,8 +1423,7 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker(
|
||||
continue;
|
||||
}
|
||||
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
if !tenant_shard_id.is_zero() {
|
||||
// We only send consumption metrics from shard 0, so don't waste time calculating
|
||||
// synthetic size on other shards.
|
||||
continue;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user