mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-30 02:30:38 +00:00
Compare commits
15 Commits
jcsp/hack
...
image_laye
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
70d1086e0f | ||
|
|
5a8e8baf9f | ||
|
|
57a4119a7b | ||
|
|
aaef3789b0 | ||
|
|
0b57e0b8f2 | ||
|
|
485ecbaf8f | ||
|
|
0bcbce197a | ||
|
|
19d59e58d2 | ||
|
|
ce65d13dbd | ||
|
|
18fefff026 | ||
|
|
2a69861896 | ||
|
|
98375b3896 | ||
|
|
8c60359ae5 | ||
|
|
8c7136b057 | ||
|
|
0df6c41eaa |
@@ -22,7 +22,6 @@
|
|||||||
!s3_scrubber/
|
!s3_scrubber/
|
||||||
!safekeeper/
|
!safekeeper/
|
||||||
!storage_broker/
|
!storage_broker/
|
||||||
!storage_controller/
|
|
||||||
!trace/
|
!trace/
|
||||||
!vendor/postgres-*/
|
!vendor/postgres-*/
|
||||||
!workspace_hack/
|
!workspace_hack/
|
||||||
|
|||||||
@@ -150,7 +150,7 @@ runs:
|
|||||||
|
|
||||||
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
|
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
|
||||||
# and to keep files on the host to upload them to the database
|
# and to keep files on the host to upload them to the database
|
||||||
time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"
|
time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
|
||||||
|
|
||||||
# Generate redirect
|
# Generate redirect
|
||||||
cat <<EOF > ${WORKDIR}/index.html
|
cat <<EOF > ${WORKDIR}/index.html
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ inputs:
|
|||||||
required: true
|
required: true
|
||||||
api_host:
|
api_host:
|
||||||
desctiption: 'Neon API host'
|
desctiption: 'Neon API host'
|
||||||
default: console-stage.neon.build
|
default: console.stage.neon.tech
|
||||||
outputs:
|
outputs:
|
||||||
dsn:
|
dsn:
|
||||||
description: 'Created Branch DSN (for main database)'
|
description: 'Created Branch DSN (for main database)'
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ inputs:
|
|||||||
required: true
|
required: true
|
||||||
api_host:
|
api_host:
|
||||||
desctiption: 'Neon API host'
|
desctiption: 'Neon API host'
|
||||||
default: console-stage.neon.build
|
default: console.stage.neon.tech
|
||||||
|
|
||||||
runs:
|
runs:
|
||||||
using: "composite"
|
using: "composite"
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ inputs:
|
|||||||
default: 15
|
default: 15
|
||||||
api_host:
|
api_host:
|
||||||
desctiption: 'Neon API host'
|
desctiption: 'Neon API host'
|
||||||
default: console-stage.neon.build
|
default: console.stage.neon.tech
|
||||||
provisioner:
|
provisioner:
|
||||||
desctiption: 'k8s-pod or k8s-neonvm'
|
desctiption: 'k8s-pod or k8s-neonvm'
|
||||||
default: 'k8s-pod'
|
default: 'k8s-pod'
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ inputs:
|
|||||||
required: true
|
required: true
|
||||||
api_host:
|
api_host:
|
||||||
desctiption: 'Neon API host'
|
desctiption: 'Neon API host'
|
||||||
default: console-stage.neon.build
|
default: console.stage.neon.tech
|
||||||
|
|
||||||
runs:
|
runs:
|
||||||
using: "composite"
|
using: "composite"
|
||||||
|
|||||||
1
.github/workflows/approved-for-ci-run.yml
vendored
1
.github/workflows/approved-for-ci-run.yml
vendored
@@ -18,7 +18,6 @@ on:
|
|||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
|
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
env:
|
env:
|
||||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|||||||
58
.github/workflows/benchmarking.yml
vendored
58
.github/workflows/benchmarking.yml
vendored
@@ -147,16 +147,15 @@ jobs:
|
|||||||
"neonvm-captest-new"
|
"neonvm-captest-new"
|
||||||
],
|
],
|
||||||
"db_size": [ "10gb" ],
|
"db_size": [ "10gb" ],
|
||||||
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||||
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
||||||
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||||
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
|
{ "platform": "neonvm-captest-new", "db_size": "50gb" }]
|
||||||
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
|
||||||
}'
|
}'
|
||||||
|
|
||||||
if [ "$(date +%A)" = "Saturday" ]; then
|
if [ "$(date +%A)" = "Saturday" ]; then
|
||||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
|
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
|
||||||
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||||
@@ -172,7 +171,7 @@ jobs:
|
|||||||
|
|
||||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
|
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
|
||||||
{ "platform": "rds-aurora" }]')
|
{ "platform": "rds-aurora" }]')
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||||
@@ -191,7 +190,7 @@ jobs:
|
|||||||
|
|
||||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
||||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||||
@@ -254,9 +253,6 @@ jobs:
|
|||||||
neon-captest-reuse)
|
neon-captest-reuse)
|
||||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||||
;;
|
;;
|
||||||
neonvm-captest-sharding-reuse)
|
|
||||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
|
|
||||||
;;
|
|
||||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
|
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
|
||||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||||
;;
|
;;
|
||||||
@@ -274,15 +270,11 @@ jobs:
|
|||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
QUERIES=("SELECT version()")
|
QUERY="SELECT version();"
|
||||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||||
QUERIES+=("SHOW neon.tenant_id")
|
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||||
QUERIES+=("SHOW neon.timeline_id")
|
|
||||||
fi
|
fi
|
||||||
|
psql ${CONNSTR} -c "${QUERY}"
|
||||||
for q in "${QUERIES[@]}"; do
|
|
||||||
psql ${CONNSTR} -c "${q}"
|
|
||||||
done
|
|
||||||
|
|
||||||
- name: Benchmark init
|
- name: Benchmark init
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
@@ -409,15 +401,11 @@ jobs:
|
|||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
QUERIES=("SELECT version()")
|
QUERY="SELECT version();"
|
||||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||||
QUERIES+=("SHOW neon.tenant_id")
|
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||||
QUERIES+=("SHOW neon.timeline_id")
|
|
||||||
fi
|
fi
|
||||||
|
psql ${CONNSTR} -c "${QUERY}"
|
||||||
for q in "${QUERIES[@]}"; do
|
|
||||||
psql ${CONNSTR} -c "${q}"
|
|
||||||
done
|
|
||||||
|
|
||||||
- name: ClickBench benchmark
|
- name: ClickBench benchmark
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
@@ -519,15 +507,11 @@ jobs:
|
|||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
QUERIES=("SELECT version()")
|
QUERY="SELECT version();"
|
||||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||||
QUERIES+=("SHOW neon.tenant_id")
|
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||||
QUERIES+=("SHOW neon.timeline_id")
|
|
||||||
fi
|
fi
|
||||||
|
psql ${CONNSTR} -c "${QUERY}"
|
||||||
for q in "${QUERIES[@]}"; do
|
|
||||||
psql ${CONNSTR} -c "${q}"
|
|
||||||
done
|
|
||||||
|
|
||||||
- name: Run TPC-H benchmark
|
- name: Run TPC-H benchmark
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
@@ -613,15 +597,11 @@ jobs:
|
|||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
QUERIES=("SELECT version()")
|
QUERY="SELECT version();"
|
||||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||||
QUERIES+=("SHOW neon.tenant_id")
|
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||||
QUERIES+=("SHOW neon.timeline_id")
|
|
||||||
fi
|
fi
|
||||||
|
psql ${CONNSTR} -c "${QUERY}"
|
||||||
for q in "${QUERIES[@]}"; do
|
|
||||||
psql ${CONNSTR} -c "${q}"
|
|
||||||
done
|
|
||||||
|
|
||||||
- name: Run user examples
|
- name: Run user examples
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ defaults:
|
|||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: build-build-tools-image-${{ inputs.image-tag }}
|
group: build-build-tools-image-${{ inputs.image-tag }}
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||||
permissions: {}
|
permissions: {}
|
||||||
|
|||||||
25
.github/workflows/build_and_test.yml
vendored
25
.github/workflows/build_and_test.yml
vendored
@@ -461,7 +461,6 @@ jobs:
|
|||||||
|
|
||||||
- name: Pytest regression tests
|
- name: Pytest regression tests
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
timeout-minutes: 60
|
|
||||||
with:
|
with:
|
||||||
build_type: ${{ matrix.build_type }}
|
build_type: ${{ matrix.build_type }}
|
||||||
test_selection: regress
|
test_selection: regress
|
||||||
@@ -1121,34 +1120,18 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||||
|
|
||||||
|
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
|
||||||
|
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
|
||||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||||
-f deployPgSniRouter=false \
|
-f deployPgSniRouter=false \
|
||||||
-f deployProxy=false \
|
-f deployProxy=false \
|
||||||
-f deployStorage=true \
|
-f deployStorage=true \
|
||||||
-f deployStorageBroker=true \
|
-f deployStorageBroker=true \
|
||||||
-f deployStorageController=true \
|
|
||||||
-f branch=main \
|
|
||||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
|
||||||
-f deployPreprodRegion=true
|
|
||||||
|
|
||||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
|
||||||
-f deployStorage=true \
|
|
||||||
-f deployStorageBroker=true \
|
|
||||||
-f deployStorageController=true \
|
|
||||||
-f branch=main \
|
-f branch=main \
|
||||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
|
||||||
-f deployPgSniRouter=true \
|
|
||||||
-f deployProxy=true \
|
|
||||||
-f deployStorage=false \
|
|
||||||
-f deployStorageBroker=false \
|
|
||||||
-f deployStorageController=false \
|
|
||||||
-f branch=main \
|
|
||||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
|
||||||
-f deployPreprodRegion=true
|
|
||||||
|
|
||||||
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
|
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
|
||||||
-f deployPgSniRouter=true \
|
-f deployPgSniRouter=true \
|
||||||
-f deployProxy=true \
|
-f deployProxy=true \
|
||||||
|
|||||||
@@ -28,9 +28,7 @@ jobs:
|
|||||||
- name: Get build-tools image tag for the current commit
|
- name: Get build-tools image tag for the current commit
|
||||||
id: get-build-tools-tag
|
id: get-build-tools-tag
|
||||||
env:
|
env:
|
||||||
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
|
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
|
|
||||||
COMMIT_SHA: ${{ github.sha }}
|
|
||||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
LAST_BUILD_TOOLS_SHA=$(
|
LAST_BUILD_TOOLS_SHA=$(
|
||||||
|
|||||||
1
.github/workflows/pin-build-tools-image.yml
vendored
1
.github/workflows/pin-build-tools-image.yml
vendored
@@ -20,7 +20,6 @@ defaults:
|
|||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: pin-build-tools-image-${{ inputs.from-tag }}
|
group: pin-build-tools-image-${{ inputs.from-tag }}
|
||||||
cancel-in-progress: false
|
|
||||||
|
|
||||||
permissions: {}
|
permissions: {}
|
||||||
|
|
||||||
|
|||||||
90
.github/workflows/trigger-e2e-tests.yml
vendored
90
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -62,14 +62,14 @@ jobs:
|
|||||||
|
|
||||||
trigger-e2e-tests:
|
trigger-e2e-tests:
|
||||||
needs: [ tag ]
|
needs: [ tag ]
|
||||||
runs-on: ubuntu-latest
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
env:
|
env:
|
||||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||||
|
container:
|
||||||
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||||
|
options: --init
|
||||||
steps:
|
steps:
|
||||||
- name: check if ecr image are present
|
- name: check if ecr image are present
|
||||||
env:
|
|
||||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
|
||||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
|
||||||
run: |
|
run: |
|
||||||
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
|
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
|
||||||
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
|
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
|
||||||
@@ -79,55 +79,41 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
- name: Set e2e-platforms
|
|
||||||
id: e2e-platforms
|
|
||||||
env:
|
|
||||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
|
||||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
run: |
|
|
||||||
# Default set of platforms to run e2e tests on
|
|
||||||
platforms='["docker", "k8s"]'
|
|
||||||
|
|
||||||
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
|
|
||||||
# If the workflow run is not a pull request, add k8s-neonvm to the list.
|
|
||||||
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
|
|
||||||
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
|
|
||||||
case "$f" in
|
|
||||||
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
|
|
||||||
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
# no-op
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
else
|
|
||||||
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Set PR's status to pending and request a remote CI test
|
- name: Set PR's status to pending and request a remote CI test
|
||||||
env:
|
|
||||||
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
|
|
||||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
|
||||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
|
||||||
run: |
|
run: |
|
||||||
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
|
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
|
||||||
|
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
|
||||||
|
# to place a job run status update later.
|
||||||
|
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||||
|
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
|
||||||
|
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||||
|
|
||||||
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
|
REMOTE_REPO="${{ github.repository_owner }}/cloud"
|
||||||
--method POST \
|
|
||||||
--raw-field "state=pending" \
|
|
||||||
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
|
|
||||||
--raw-field "context=neon-cloud-e2e"
|
|
||||||
|
|
||||||
gh workflow --repo ${REMOTE_REPO} \
|
curl -f -X POST \
|
||||||
run testing.yml \
|
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
||||||
--ref "main" \
|
-H "Accept: application/vnd.github.v3+json" \
|
||||||
--raw-field "ci_job_name=neon-cloud-e2e" \
|
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||||
--raw-field "commit_hash=$COMMIT_SHA" \
|
--data \
|
||||||
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
|
"{
|
||||||
--raw-field "storage_image_tag=${TAG}" \
|
\"state\": \"pending\",
|
||||||
--raw-field "compute_image_tag=${TAG}" \
|
\"context\": \"neon-cloud-e2e\",
|
||||||
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
|
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||||
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
|
}"
|
||||||
|
|
||||||
|
curl -f -X POST \
|
||||||
|
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
||||||
|
-H "Accept: application/vnd.github.v3+json" \
|
||||||
|
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||||
|
--data \
|
||||||
|
"{
|
||||||
|
\"ref\": \"main\",
|
||||||
|
\"inputs\": {
|
||||||
|
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||||
|
\"commit_hash\": \"$COMMIT_SHA\",
|
||||||
|
\"remote_repo\": \"${{ github.repository }}\",
|
||||||
|
\"storage_image_tag\": \"${TAG}\",
|
||||||
|
\"compute_image_tag\": \"${TAG}\",
|
||||||
|
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
||||||
|
}
|
||||||
|
}"
|
||||||
|
|||||||
@@ -1,13 +1,12 @@
|
|||||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||||
/storage_controller @neondatabase/storage
|
/control_plane/attachment_service @neondatabase/storage
|
||||||
/libs/pageserver_api/ @neondatabase/storage
|
/libs/pageserver_api/ @neondatabase/storage
|
||||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
|
/libs/postgres_ffi/ @neondatabase/compute
|
||||||
/libs/remote_storage/ @neondatabase/storage
|
/libs/remote_storage/ @neondatabase/storage
|
||||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||||
/pageserver/ @neondatabase/storage
|
/pageserver/ @neondatabase/storage
|
||||||
/pgxn/ @neondatabase/compute
|
/pgxn/ @neondatabase/compute
|
||||||
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
|
|
||||||
/proxy/ @neondatabase/proxy
|
/proxy/ @neondatabase/proxy
|
||||||
/safekeeper/ @neondatabase/safekeepers
|
/safekeeper/ @neondatabase/safekeepers
|
||||||
/vendor/ @neondatabase/compute
|
/vendor/ @neondatabase/compute
|
||||||
|
|||||||
712
Cargo.lock
generated
712
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
20
Cargo.toml
20
Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
|
|||||||
members = [
|
members = [
|
||||||
"compute_tools",
|
"compute_tools",
|
||||||
"control_plane",
|
"control_plane",
|
||||||
"control_plane/storcon_cli",
|
"control_plane/attachment_service",
|
||||||
"pageserver",
|
"pageserver",
|
||||||
"pageserver/compaction",
|
"pageserver/compaction",
|
||||||
"pageserver/ctl",
|
"pageserver/ctl",
|
||||||
@@ -12,7 +12,6 @@ members = [
|
|||||||
"proxy",
|
"proxy",
|
||||||
"safekeeper",
|
"safekeeper",
|
||||||
"storage_broker",
|
"storage_broker",
|
||||||
"storage_controller",
|
|
||||||
"s3_scrubber",
|
"s3_scrubber",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
"trace",
|
"trace",
|
||||||
@@ -44,7 +43,6 @@ license = "Apache-2.0"
|
|||||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||||
arc-swap = "1.6"
|
arc-swap = "1.6"
|
||||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||||
atomic-take = "1.1.0"
|
|
||||||
azure_core = "0.18"
|
azure_core = "0.18"
|
||||||
azure_identity = "0.18"
|
azure_identity = "0.18"
|
||||||
azure_storage = "0.18"
|
azure_storage = "0.18"
|
||||||
@@ -54,12 +52,10 @@ async-stream = "0.3"
|
|||||||
async-trait = "0.1"
|
async-trait = "0.1"
|
||||||
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
|
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
|
||||||
aws-sdk-s3 = "1.14"
|
aws-sdk-s3 = "1.14"
|
||||||
aws-sdk-iam = "1.15.0"
|
aws-sdk-secretsmanager = { version = "1.14.0" }
|
||||||
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
|
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
|
||||||
aws-smithy-types = "1.1.4"
|
aws-smithy-types = "1.1.4"
|
||||||
aws-credential-types = "1.1.4"
|
aws-credential-types = "1.1.4"
|
||||||
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
|
|
||||||
aws-types = "1.1.7"
|
|
||||||
axum = { version = "0.6.20", features = ["ws"] }
|
axum = { version = "0.6.20", features = ["ws"] }
|
||||||
base64 = "0.13.0"
|
base64 = "0.13.0"
|
||||||
bincode = "1.3"
|
bincode = "1.3"
|
||||||
@@ -80,7 +76,6 @@ either = "1.8"
|
|||||||
enum-map = "2.4.2"
|
enum-map = "2.4.2"
|
||||||
enumset = "1.0.12"
|
enumset = "1.0.12"
|
||||||
fail = "0.5.0"
|
fail = "0.5.0"
|
||||||
fallible-iterator = "0.2"
|
|
||||||
fs2 = "0.4.3"
|
fs2 = "0.4.3"
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
futures-core = "0.3"
|
futures-core = "0.3"
|
||||||
@@ -93,12 +88,11 @@ hex = "0.4"
|
|||||||
hex-literal = "0.4"
|
hex-literal = "0.4"
|
||||||
hmac = "0.12.1"
|
hmac = "0.12.1"
|
||||||
hostname = "0.3.1"
|
hostname = "0.3.1"
|
||||||
http = {version = "1.1.0", features = ["std"]}
|
|
||||||
http-types = { version = "2", default-features = false }
|
http-types = { version = "2", default-features = false }
|
||||||
humantime = "2.1"
|
humantime = "2.1"
|
||||||
humantime-serde = "1.1.1"
|
humantime-serde = "1.1.1"
|
||||||
hyper = "0.14"
|
hyper = "0.14"
|
||||||
hyper-tungstenite = "0.13.0"
|
hyper-tungstenite = "0.11"
|
||||||
inotify = "0.10.2"
|
inotify = "0.10.2"
|
||||||
ipnet = "2.9.0"
|
ipnet = "2.9.0"
|
||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
@@ -106,9 +100,8 @@ jsonwebtoken = "9"
|
|||||||
lasso = "0.7"
|
lasso = "0.7"
|
||||||
leaky-bucket = "1.0.1"
|
leaky-bucket = "1.0.1"
|
||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
|
lz4_flex = "0.11.1"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
measured = { version = "0.0.21", features=["lasso"] }
|
|
||||||
measured-process = { version = "0.0.21" }
|
|
||||||
memoffset = "0.8"
|
memoffset = "0.8"
|
||||||
native-tls = "0.2"
|
native-tls = "0.2"
|
||||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||||
@@ -128,7 +121,7 @@ procfs = "0.14"
|
|||||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||||
prost = "0.11"
|
prost = "0.11"
|
||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||||
regex = "1.10.2"
|
regex = "1.10.2"
|
||||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||||
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
|
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
|
||||||
@@ -156,12 +149,11 @@ smol_str = { version = "0.2.0", features = ["serde"] }
|
|||||||
socket2 = "0.5"
|
socket2 = "0.5"
|
||||||
strum = "0.24"
|
strum = "0.24"
|
||||||
strum_macros = "0.24"
|
strum_macros = "0.24"
|
||||||
"subtle" = "2.5.0"
|
|
||||||
svg_fmt = "0.4.1"
|
svg_fmt = "0.4.1"
|
||||||
sync_wrapper = "0.1.2"
|
sync_wrapper = "0.1.2"
|
||||||
tar = "0.4"
|
tar = "0.4"
|
||||||
task-local-extensions = "0.1.4"
|
task-local-extensions = "0.1.4"
|
||||||
test-context = "0.3"
|
test-context = "0.1"
|
||||||
thiserror = "1.0"
|
thiserror = "1.0"
|
||||||
tikv-jemallocator = "0.5"
|
tikv-jemallocator = "0.5"
|
||||||
tikv-jemalloc-ctl = "0.5"
|
tikv-jemalloc-ctl = "0.5"
|
||||||
|
|||||||
@@ -58,12 +58,6 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
|
|||||||
&& mv protoc/include/google /usr/local/include/google \
|
&& mv protoc/include/google /usr/local/include/google \
|
||||||
&& rm -rf protoc.zip protoc
|
&& rm -rf protoc.zip protoc
|
||||||
|
|
||||||
# s5cmd
|
|
||||||
ENV S5CMD_VERSION=2.2.2
|
|
||||||
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
|
|
||||||
&& chmod +x s5cmd \
|
|
||||||
&& mv s5cmd /usr/local/bin/s5cmd
|
|
||||||
|
|
||||||
# LLVM
|
# LLVM
|
||||||
ENV LLVM_VERSION=17
|
ENV LLVM_VERSION=17
|
||||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||||
@@ -141,7 +135,7 @@ WORKDIR /home/nonroot
|
|||||||
|
|
||||||
# Rust
|
# Rust
|
||||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||||
ENV RUSTC_VERSION=1.77.0
|
ENV RUSTC_VERSION=1.76.0
|
||||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||||
@@ -155,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
|||||||
cargo install --git https://github.com/paritytech/cachepot && \
|
cargo install --git https://github.com/paritytech/cachepot && \
|
||||||
cargo install rustfilt && \
|
cargo install rustfilt && \
|
||||||
cargo install cargo-hakari && \
|
cargo install cargo-hakari && \
|
||||||
cargo install cargo-deny --locked && \
|
cargo install cargo-deny && \
|
||||||
cargo install cargo-hack && \
|
cargo install cargo-hack && \
|
||||||
cargo install cargo-nextest && \
|
cargo install cargo-nextest && \
|
||||||
rm -rf /home/nonroot/.cargo/registry && \
|
rm -rf /home/nonroot/.cargo/registry && \
|
||||||
|
|||||||
@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
|||||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||||
|
|
||||||
# Create remote extension download directory
|
|
||||||
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
|
|
||||||
|
|
||||||
# Install:
|
# Install:
|
||||||
# libreadline8 for psql
|
# libreadline8 for psql
|
||||||
# libicu67, locales for collations (including ICU and plpgsql_check)
|
# libicu67, locales for collations (including ICU and plpgsql_check)
|
||||||
|
|||||||
@@ -238,14 +238,6 @@ If you encounter errors during setting up the initial tenant, it's best to stop
|
|||||||
|
|
||||||
## Running tests
|
## Running tests
|
||||||
|
|
||||||
### Rust unit tests
|
|
||||||
|
|
||||||
We are using [`cargo-nextest`](https://nexte.st/) to run the tests in Github Workflows.
|
|
||||||
Some crates do not support running plain `cargo test` anymore, prefer `cargo nextest run` instead.
|
|
||||||
You can install `cargo-nextest` with `cargo install cargo-nextest`.
|
|
||||||
|
|
||||||
### Integration tests
|
|
||||||
|
|
||||||
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
|
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
|
|||||||
@@ -2,8 +2,6 @@ disallowed-methods = [
|
|||||||
"tokio::task::block_in_place",
|
"tokio::task::block_in_place",
|
||||||
# Allow this for now, to deny it later once we stop using Handle::block_on completely
|
# Allow this for now, to deny it later once we stop using Handle::block_on completely
|
||||||
# "tokio::runtime::Handle::block_on",
|
# "tokio::runtime::Handle::block_on",
|
||||||
# use tokio_epoll_uring_ext instead
|
|
||||||
"tokio_epoll_uring::thread_local_system",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
disallowed-macros = [
|
disallowed-macros = [
|
||||||
|
|||||||
@@ -32,29 +32,6 @@ compute_ctl -D /var/db/postgres/compute \
|
|||||||
-b /usr/local/bin/postgres
|
-b /usr/local/bin/postgres
|
||||||
```
|
```
|
||||||
|
|
||||||
## State Diagram
|
|
||||||
|
|
||||||
Computes can be in various states. Below is a diagram that details how a
|
|
||||||
compute moves between states.
|
|
||||||
|
|
||||||
```mermaid
|
|
||||||
%% https://mermaid.js.org/syntax/stateDiagram.html
|
|
||||||
stateDiagram-v2
|
|
||||||
[*] --> Empty : Compute spawned
|
|
||||||
Empty --> ConfigurationPending : Waiting for compute spec
|
|
||||||
ConfigurationPending --> Configuration : Received compute spec
|
|
||||||
Configuration --> Failed : Failed to configure the compute
|
|
||||||
Configuration --> Running : Compute has been configured
|
|
||||||
Empty --> Init : Compute spec is immediately available
|
|
||||||
Empty --> TerminationPending : Requested termination
|
|
||||||
Init --> Failed : Failed to start Postgres
|
|
||||||
Init --> Running : Started Postgres
|
|
||||||
Running --> TerminationPending : Requested termination
|
|
||||||
TerminationPending --> Terminated : Terminated compute
|
|
||||||
Failed --> [*] : Compute exited
|
|
||||||
Terminated --> [*] : Compute exited
|
|
||||||
```
|
|
||||||
|
|
||||||
## Tests
|
## Tests
|
||||||
|
|
||||||
Cargo formatter:
|
Cargo formatter:
|
||||||
|
|||||||
@@ -818,15 +818,9 @@ impl ComputeNode {
|
|||||||
Client::connect(zenith_admin_connstr.as_str(), NoTls)
|
Client::connect(zenith_admin_connstr.as_str(), NoTls)
|
||||||
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
|
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
|
||||||
// Disable forwarding so that users don't get a cloud_admin role
|
// Disable forwarding so that users don't get a cloud_admin role
|
||||||
|
client.simple_query("SET neon.forward_ddl = false")?;
|
||||||
let mut func = || {
|
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||||
client.simple_query("SET neon.forward_ddl = false")?;
|
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
|
||||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
|
||||||
Ok::<_, anyhow::Error>(())
|
|
||||||
};
|
|
||||||
func().context("apply_config setup cloud_admin")?;
|
|
||||||
|
|
||||||
drop(client);
|
drop(client);
|
||||||
|
|
||||||
// reconnect with connstring with expected name
|
// reconnect with connstring with expected name
|
||||||
@@ -838,29 +832,24 @@ impl ComputeNode {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||||
client
|
client.simple_query("SET neon.forward_ddl = false")?;
|
||||||
.simple_query("SET neon.forward_ddl = false")
|
|
||||||
.context("apply_config SET neon.forward_ddl = false")?;
|
|
||||||
|
|
||||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||||
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
|
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
|
||||||
create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
|
create_neon_superuser(spec, &mut client)?;
|
||||||
cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
|
cleanup_instance(&mut client)?;
|
||||||
handle_roles(spec, &mut client).context("apply_config handle_roles")?;
|
handle_roles(spec, &mut client)?;
|
||||||
handle_databases(spec, &mut client).context("apply_config handle_databases")?;
|
handle_databases(spec, &mut client)?;
|
||||||
handle_role_deletions(spec, connstr.as_str(), &mut client)
|
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
|
||||||
.context("apply_config handle_role_deletions")?;
|
|
||||||
handle_grants(
|
handle_grants(
|
||||||
spec,
|
spec,
|
||||||
&mut client,
|
&mut client,
|
||||||
connstr.as_str(),
|
connstr.as_str(),
|
||||||
self.has_feature(ComputeFeature::AnonExtension),
|
self.has_feature(ComputeFeature::AnonExtension),
|
||||||
)
|
)?;
|
||||||
.context("apply_config handle_grants")?;
|
handle_extensions(spec, &mut client)?;
|
||||||
handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
|
handle_extension_neon(&mut client)?;
|
||||||
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
|
create_availability_check_data(&mut client)?;
|
||||||
create_availability_check_data(&mut client)
|
|
||||||
.context("apply_config create_availability_check_data")?;
|
|
||||||
|
|
||||||
// 'Close' connection
|
// 'Close' connection
|
||||||
drop(client);
|
drop(client);
|
||||||
@@ -868,7 +857,7 @@ impl ComputeNode {
|
|||||||
// Run migrations separately to not hold up cold starts
|
// Run migrations separately to not hold up cold starts
|
||||||
thread::spawn(move || {
|
thread::spawn(move || {
|
||||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||||
handle_migrations(&mut client).context("apply_config handle_migrations")
|
handle_migrations(&mut client)
|
||||||
});
|
});
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1273,12 +1262,10 @@ LIMIT 100",
|
|||||||
.await
|
.await
|
||||||
.map_err(DownloadError::Other);
|
.map_err(DownloadError::Other);
|
||||||
|
|
||||||
if download_size.is_ok() {
|
self.ext_download_progress
|
||||||
self.ext_download_progress
|
.write()
|
||||||
.write()
|
.expect("bad lock")
|
||||||
.expect("bad lock")
|
.insert(ext_archive_name.to_string(), (download_start, true));
|
||||||
.insert(ext_archive_name.to_string(), (download_start, true));
|
|
||||||
}
|
|
||||||
|
|
||||||
download_size
|
download_size
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,8 +6,8 @@ use std::path::Path;
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
|
||||||
use crate::pg_helpers::escape_conf_value;
|
use crate::pg_helpers::escape_conf_value;
|
||||||
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
|
use crate::pg_helpers::PgOptionsSerialize;
|
||||||
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
|
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||||
|
|
||||||
/// Check that `line` is inside a text file and put it there if it is not.
|
/// Check that `line` is inside a text file and put it there if it is not.
|
||||||
/// Create file if it doesn't exist.
|
/// Create file if it doesn't exist.
|
||||||
@@ -17,7 +17,6 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
|||||||
.write(true)
|
.write(true)
|
||||||
.create(true)
|
.create(true)
|
||||||
.append(false)
|
.append(false)
|
||||||
.truncate(false)
|
|
||||||
.open(path)?;
|
.open(path)?;
|
||||||
let buf = io::BufReader::new(&file);
|
let buf = io::BufReader::new(&file);
|
||||||
let mut count: usize = 0;
|
let mut count: usize = 0;
|
||||||
@@ -92,27 +91,6 @@ pub fn write_postgres_conf(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if cfg!(target_os = "linux") {
|
|
||||||
// Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
|
|
||||||
// disabled), then the control plane has enabled swap and we should set
|
|
||||||
// dynamic_shared_memory_type = 'mmap'.
|
|
||||||
//
|
|
||||||
// This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
|
|
||||||
let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
|
|
||||||
// ignore any errors - they may be expected to occur under certain situations (e.g. when
|
|
||||||
// not running in Linux).
|
|
||||||
.unwrap_or_else(|_| String::new());
|
|
||||||
if overcommit_memory_contents.trim() == "2" {
|
|
||||||
let opt = GenericOption {
|
|
||||||
name: "dynamic_shared_memory_type".to_owned(),
|
|
||||||
value: Some("mmap".to_owned()),
|
|
||||||
vartype: "enum".to_owned(),
|
|
||||||
};
|
|
||||||
|
|
||||||
write!(file, "{}", opt.to_pg_setting())?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If there are any extra options in the 'settings' field, append those
|
// If there are any extra options in the 'settings' field, append those
|
||||||
if spec.cluster.settings.is_some() {
|
if spec.cluster.settings.is_some() {
|
||||||
writeln!(file, "# Managed by compute_ctl: begin")?;
|
writeln!(file, "# Managed by compute_ctl: begin")?;
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
|
|||||||
format!("'{}'", res)
|
format!("'{}'", res)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub trait GenericOptionExt {
|
trait GenericOptionExt {
|
||||||
fn to_pg_option(&self) -> String;
|
fn to_pg_option(&self) -> String;
|
||||||
fn to_pg_setting(&self) -> String;
|
fn to_pg_setting(&self) -> String;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use std::fs::File;
|
|||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
use anyhow::{anyhow, bail, Context, Result};
|
use anyhow::{anyhow, bail, Result};
|
||||||
use postgres::config::Config;
|
use postgres::config::Config;
|
||||||
use postgres::{Client, NoTls};
|
use postgres::{Client, NoTls};
|
||||||
use reqwest::StatusCode;
|
use reqwest::StatusCode;
|
||||||
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
RoleAction::Create => {
|
RoleAction::Create => {
|
||||||
// This branch only runs when roles are created through the console, so it is
|
// This branch only runs when roles are created through the console, so it is
|
||||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||||
// from neon_superuser.
|
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
|
||||||
let mut query: String = format!(
|
let mut query: String = format!(
|
||||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||||
name.pg_quote()
|
name.pg_quote()
|
||||||
);
|
);
|
||||||
info!("running role create query: '{}'", &query);
|
info!("running role create query: '{}'", &query);
|
||||||
@@ -698,8 +698,7 @@ pub fn handle_grants(
|
|||||||
|
|
||||||
// it is important to run this after all grants
|
// it is important to run this after all grants
|
||||||
if enable_anon_extension {
|
if enable_anon_extension {
|
||||||
handle_extension_anon(spec, &db.owner, &mut db_client, false)
|
handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
|
||||||
.context("handle_grants handle_extension_anon")?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -746,12 +745,7 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
|
|||||||
// - extension was already installed and is up to date
|
// - extension was already installed and is up to date
|
||||||
let query = "ALTER EXTENSION neon UPDATE";
|
let query = "ALTER EXTENSION neon UPDATE";
|
||||||
info!("update neon extension version with query: {}", query);
|
info!("update neon extension version with query: {}", query);
|
||||||
if let Err(e) = client.simple_query(query) {
|
client.simple_query(query)?;
|
||||||
error!(
|
|
||||||
"failed to upgrade neon extension during `handle_extension_neon`: {}",
|
|
||||||
e
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -810,40 +804,43 @@ $$;"#,
|
|||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
"",
|
|
||||||
// Add new migrations below.
|
// Add new migrations below.
|
||||||
|
r#"
|
||||||
|
DO $$
|
||||||
|
DECLARE
|
||||||
|
role_name TEXT;
|
||||||
|
BEGIN
|
||||||
|
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
||||||
|
LOOP
|
||||||
|
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
||||||
|
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
||||||
|
END LOOP;
|
||||||
|
END
|
||||||
|
$$;"#,
|
||||||
];
|
];
|
||||||
|
|
||||||
let mut func = || {
|
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||||
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
client.simple_query(query)?;
|
||||||
client.simple_query(query)?;
|
|
||||||
|
|
||||||
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||||
client.simple_query(query)?;
|
client.simple_query(query)?;
|
||||||
|
|
||||||
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||||
client.simple_query(query)?;
|
client.simple_query(query)?;
|
||||||
|
|
||||||
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||||
client.simple_query(query)?;
|
client.simple_query(query)?;
|
||||||
|
|
||||||
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||||
client.simple_query(query)?;
|
client.simple_query(query)?;
|
||||||
Ok::<_, anyhow::Error>(())
|
|
||||||
};
|
|
||||||
func().context("handle_migrations prepare")?;
|
|
||||||
|
|
||||||
let query = "SELECT id FROM neon_migration.migration_id";
|
query = "SELECT id FROM neon_migration.migration_id";
|
||||||
let row = client
|
let row = client.query_one(query, &[])?;
|
||||||
.query_one(query, &[])
|
|
||||||
.context("handle_migrations get migration_id")?;
|
|
||||||
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
|
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
|
||||||
let starting_migration_id = current_migration;
|
let starting_migration_id = current_migration;
|
||||||
|
|
||||||
let query = "BEGIN";
|
query = "BEGIN";
|
||||||
client
|
client.simple_query(query)?;
|
||||||
.simple_query(query)
|
|
||||||
.context("handle_migrations begin")?;
|
|
||||||
|
|
||||||
while current_migration < migrations.len() {
|
while current_migration < migrations.len() {
|
||||||
let migration = &migrations[current_migration];
|
let migration = &migrations[current_migration];
|
||||||
@@ -851,9 +848,7 @@ $$;"#,
|
|||||||
info!("Skip migration id={}", current_migration);
|
info!("Skip migration id={}", current_migration);
|
||||||
} else {
|
} else {
|
||||||
info!("Running migration:\n{}\n", migration);
|
info!("Running migration:\n{}\n", migration);
|
||||||
client.simple_query(migration).with_context(|| {
|
client.simple_query(migration)?;
|
||||||
format!("handle_migrations current_migration={}", current_migration)
|
|
||||||
})?;
|
|
||||||
}
|
}
|
||||||
current_migration += 1;
|
current_migration += 1;
|
||||||
}
|
}
|
||||||
@@ -861,14 +856,10 @@ $$;"#,
|
|||||||
"UPDATE neon_migration.migration_id SET id={}",
|
"UPDATE neon_migration.migration_id SET id={}",
|
||||||
migrations.len()
|
migrations.len()
|
||||||
);
|
);
|
||||||
client
|
client.simple_query(&setval)?;
|
||||||
.simple_query(&setval)
|
|
||||||
.context("handle_migrations update id")?;
|
|
||||||
|
|
||||||
let query = "COMMIT";
|
query = "COMMIT";
|
||||||
client
|
client.simple_query(query)?;
|
||||||
.simple_query(query)
|
|
||||||
.context("handle_migrations commit")?;
|
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"Ran {} migrations",
|
"Ran {} migrations",
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ clap.workspace = true
|
|||||||
comfy-table.workspace = true
|
comfy-table.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
git-version.workspace = true
|
git-version.workspace = true
|
||||||
humantime.workspace = true
|
|
||||||
nix.workspace = true
|
nix.workspace = true
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
postgres.workspace = true
|
postgres.workspace = true
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "storage_controller"
|
name = "attachment_service"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
@@ -16,37 +16,31 @@ testing = []
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
aws-config.workspace = true
|
aws-config.workspace = true
|
||||||
bytes.workspace = true
|
aws-sdk-secretsmanager.workspace = true
|
||||||
camino.workspace = true
|
camino.workspace = true
|
||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
fail.workspace = true
|
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
git-version.workspace = true
|
git-version.workspace = true
|
||||||
hex.workspace = true
|
|
||||||
hyper.workspace = true
|
hyper.workspace = true
|
||||||
humantime.workspace = true
|
humantime.workspace = true
|
||||||
itertools.workspace = true
|
|
||||||
lasso.workspace = true
|
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
pageserver_api.workspace = true
|
pageserver_api.workspace = true
|
||||||
pageserver_client.workspace = true
|
pageserver_client.workspace = true
|
||||||
postgres_connection.workspace = true
|
postgres_connection.workspace = true
|
||||||
reqwest.workspace = true
|
reqwest.workspace = true
|
||||||
routerify.workspace = true
|
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
measured.workspace = true
|
|
||||||
|
|
||||||
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
|
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
|
||||||
diesel_migrations = { version = "2.1.0" }
|
diesel_migrations = { version = "2.1.0" }
|
||||||
r2d2 = { version = "0.8.10" }
|
r2d2 = { version = "0.8.10" }
|
||||||
|
|
||||||
utils = { path = "../libs/utils/" }
|
utils = { path = "../../libs/utils/" }
|
||||||
metrics = { path = "../libs/metrics/" }
|
metrics = { path = "../../libs/metrics/" }
|
||||||
control_plane = { path = "../control_plane" }
|
control_plane = { path = ".." }
|
||||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
|
|
||||||
@@ -1,4 +1,3 @@
|
|||||||
use std::sync::Arc;
|
|
||||||
use std::{collections::HashMap, time::Duration};
|
use std::{collections::HashMap, time::Duration};
|
||||||
|
|
||||||
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
|
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
|
||||||
@@ -15,32 +14,19 @@ use utils::{
|
|||||||
|
|
||||||
use crate::service::Config;
|
use crate::service::Config;
|
||||||
|
|
||||||
|
const BUSY_DELAY: Duration = Duration::from_secs(1);
|
||||||
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
|
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
|
||||||
|
|
||||||
const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
|
||||||
|
|
||||||
pub(crate) const API_CONCURRENCY: usize = 32;
|
pub(crate) const API_CONCURRENCY: usize = 32;
|
||||||
|
|
||||||
struct UnshardedComputeHookTenant {
|
|
||||||
// Which node is this tenant attached to
|
|
||||||
node_id: NodeId,
|
|
||||||
|
|
||||||
// Must hold this lock to send a notification.
|
|
||||||
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
|
|
||||||
}
|
|
||||||
struct ShardedComputeHookTenant {
|
struct ShardedComputeHookTenant {
|
||||||
stripe_size: ShardStripeSize,
|
stripe_size: ShardStripeSize,
|
||||||
shard_count: ShardCount,
|
shard_count: ShardCount,
|
||||||
shards: Vec<(ShardNumber, NodeId)>,
|
shards: Vec<(ShardNumber, NodeId)>,
|
||||||
|
|
||||||
// Must hold this lock to send a notification. The contents represent
|
|
||||||
// the last successfully sent notification, and are used to coalesce multiple
|
|
||||||
// updates by only sending when there is a chance since our last successful send.
|
|
||||||
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ComputeHookTenant {
|
enum ComputeHookTenant {
|
||||||
Unsharded(UnshardedComputeHookTenant),
|
Unsharded(NodeId),
|
||||||
Sharded(ShardedComputeHookTenant),
|
Sharded(ShardedComputeHookTenant),
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -52,20 +38,9 @@ impl ComputeHookTenant {
|
|||||||
shards: vec![(tenant_shard_id.shard_number, node_id)],
|
shards: vec![(tenant_shard_id.shard_number, node_id)],
|
||||||
stripe_size,
|
stripe_size,
|
||||||
shard_count: tenant_shard_id.shard_count,
|
shard_count: tenant_shard_id.shard_count,
|
||||||
send_lock: Arc::default(),
|
|
||||||
})
|
})
|
||||||
} else {
|
} else {
|
||||||
Self::Unsharded(UnshardedComputeHookTenant {
|
Self::Unsharded(node_id)
|
||||||
node_id,
|
|
||||||
send_lock: Arc::default(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
|
|
||||||
match self {
|
|
||||||
Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
|
|
||||||
Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -78,8 +53,8 @@ impl ComputeHookTenant {
|
|||||||
node_id: NodeId,
|
node_id: NodeId,
|
||||||
) {
|
) {
|
||||||
match self {
|
match self {
|
||||||
Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
|
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
|
||||||
unsharded_tenant.node_id = node_id
|
*existing_node_id = node_id
|
||||||
}
|
}
|
||||||
Self::Sharded(sharded_tenant)
|
Self::Sharded(sharded_tenant)
|
||||||
if sharded_tenant.stripe_size == stripe_size
|
if sharded_tenant.stripe_size == stripe_size
|
||||||
@@ -106,14 +81,14 @@ impl ComputeHookTenant {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
struct ComputeHookNotifyRequestShard {
|
struct ComputeHookNotifyRequestShard {
|
||||||
node_id: NodeId,
|
node_id: NodeId,
|
||||||
shard_number: ShardNumber,
|
shard_number: ShardNumber,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Request body that we send to the control plane to notify it of where a tenant is attached
|
/// Request body that we send to the control plane to notify it of where a tenant is attached
|
||||||
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
struct ComputeHookNotifyRequest {
|
struct ComputeHookNotifyRequest {
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
stripe_size: Option<ShardStripeSize>,
|
stripe_size: Option<ShardStripeSize>,
|
||||||
@@ -146,44 +121,14 @@ pub(crate) enum NotifyError {
|
|||||||
Fatal(StatusCode),
|
Fatal(StatusCode),
|
||||||
}
|
}
|
||||||
|
|
||||||
enum MaybeSendResult {
|
|
||||||
// Please send this request while holding the lock, and if you succeed then write
|
|
||||||
// the request into the lock.
|
|
||||||
Transmit(
|
|
||||||
(
|
|
||||||
ComputeHookNotifyRequest,
|
|
||||||
tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
|
|
||||||
),
|
|
||||||
),
|
|
||||||
// Something requires sending, but you must wait for a current sender then call again
|
|
||||||
AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
|
|
||||||
// Nothing requires sending
|
|
||||||
Noop,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ComputeHookTenant {
|
impl ComputeHookTenant {
|
||||||
fn maybe_send(
|
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
|
||||||
&self,
|
match self {
|
||||||
tenant_id: TenantId,
|
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
|
||||||
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
|
|
||||||
) -> MaybeSendResult {
|
|
||||||
let locked = match lock {
|
|
||||||
Some(already_locked) => already_locked,
|
|
||||||
None => {
|
|
||||||
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
|
|
||||||
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
|
|
||||||
return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
|
|
||||||
};
|
|
||||||
locked
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let request = match self {
|
|
||||||
Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
|
|
||||||
tenant_id,
|
tenant_id,
|
||||||
shards: vec![ComputeHookNotifyRequestShard {
|
shards: vec![ComputeHookNotifyRequestShard {
|
||||||
shard_number: ShardNumber(0),
|
shard_number: ShardNumber(0),
|
||||||
node_id: unsharded_tenant.node_id,
|
node_id: *node_id,
|
||||||
}],
|
}],
|
||||||
stripe_size: None,
|
stripe_size: None,
|
||||||
}),
|
}),
|
||||||
@@ -207,25 +152,12 @@ impl ComputeHookTenant {
|
|||||||
// Sharded tenant doesn't yet have information for all its shards
|
// Sharded tenant doesn't yet have information for all its shards
|
||||||
|
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
"ComputeHookTenant::maybe_send: not enough shards ({}/{})",
|
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
|
||||||
sharded_tenant.shards.len(),
|
sharded_tenant.shards.len(),
|
||||||
sharded_tenant.shard_count.count()
|
sharded_tenant.shard_count.count()
|
||||||
);
|
);
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
match request {
|
|
||||||
None => {
|
|
||||||
// Not yet ready to emit a notification
|
|
||||||
tracing::info!("Tenant isn't yet ready to emit a notification");
|
|
||||||
MaybeSendResult::Noop
|
|
||||||
}
|
|
||||||
Some(request) if Some(&request) == locked.as_ref() => {
|
|
||||||
// No change from the last value successfully sent
|
|
||||||
MaybeSendResult::Noop
|
|
||||||
}
|
|
||||||
Some(request) => MaybeSendResult::Transmit((request, locked)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -235,19 +167,8 @@ impl ComputeHookTenant {
|
|||||||
/// the compute connection string.
|
/// the compute connection string.
|
||||||
pub(super) struct ComputeHook {
|
pub(super) struct ComputeHook {
|
||||||
config: Config,
|
config: Config,
|
||||||
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
|
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
|
||||||
authorization_header: Option<String>,
|
authorization_header: Option<String>,
|
||||||
|
|
||||||
// Concurrency limiter, so that we do not overload the cloud control plane when updating
|
|
||||||
// large numbers of tenants (e.g. when failing over after a node failure)
|
|
||||||
api_concurrency: tokio::sync::Semaphore,
|
|
||||||
|
|
||||||
// This lock is only used in testing enviroments, to serialize calls into neon_lock
|
|
||||||
neon_local_lock: tokio::sync::Mutex<()>,
|
|
||||||
|
|
||||||
// We share a client across all notifications to enable connection re-use etc when
|
|
||||||
// sending large numbers of notifications
|
|
||||||
client: reqwest::Client,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ComputeHook {
|
impl ComputeHook {
|
||||||
@@ -257,30 +178,18 @@ impl ComputeHook {
|
|||||||
.clone()
|
.clone()
|
||||||
.map(|jwt| format!("Bearer {}", jwt));
|
.map(|jwt| format!("Bearer {}", jwt));
|
||||||
|
|
||||||
let client = reqwest::ClientBuilder::new()
|
|
||||||
.timeout(NOTIFY_REQUEST_TIMEOUT)
|
|
||||||
.build()
|
|
||||||
.expect("Failed to construct HTTP client");
|
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
state: Default::default(),
|
state: Default::default(),
|
||||||
config,
|
config,
|
||||||
authorization_header,
|
authorization_header,
|
||||||
neon_local_lock: Default::default(),
|
|
||||||
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
|
|
||||||
client,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// For test environments: use neon_local's LocalEnv to update compute
|
/// For test environments: use neon_local's LocalEnv to update compute
|
||||||
async fn do_notify_local(
|
async fn do_notify_local(
|
||||||
&self,
|
&self,
|
||||||
reconfigure_request: &ComputeHookNotifyRequest,
|
reconfigure_request: ComputeHookNotifyRequest,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
// neon_local updates are not safe to call concurrently, use a lock to serialize
|
|
||||||
// all calls to this function
|
|
||||||
let _locked = self.neon_local_lock.lock().await;
|
|
||||||
|
|
||||||
let env = match LocalEnv::load_config() {
|
let env = match LocalEnv::load_config() {
|
||||||
Ok(e) => e,
|
Ok(e) => e,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
@@ -297,7 +206,7 @@ impl ComputeHook {
|
|||||||
} = reconfigure_request;
|
} = reconfigure_request;
|
||||||
|
|
||||||
let compute_pageservers = shards
|
let compute_pageservers = shards
|
||||||
.iter()
|
.into_iter()
|
||||||
.map(|shard| {
|
.map(|shard| {
|
||||||
let ps_conf = env
|
let ps_conf = env
|
||||||
.get_pageserver_conf(shard.node_id)
|
.get_pageserver_conf(shard.node_id)
|
||||||
@@ -309,10 +218,10 @@ impl ComputeHook {
|
|||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
for (endpoint_name, endpoint) in &cplane.endpoints {
|
for (endpoint_name, endpoint) in &cplane.endpoints {
|
||||||
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
|
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
|
||||||
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
|
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
|
||||||
endpoint
|
endpoint
|
||||||
.reconfigure(compute_pageservers.clone(), *stripe_size)
|
.reconfigure(compute_pageservers.clone(), stripe_size)
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -322,11 +231,12 @@ impl ComputeHook {
|
|||||||
|
|
||||||
async fn do_notify_iteration(
|
async fn do_notify_iteration(
|
||||||
&self,
|
&self,
|
||||||
|
client: &reqwest::Client,
|
||||||
url: &String,
|
url: &String,
|
||||||
reconfigure_request: &ComputeHookNotifyRequest,
|
reconfigure_request: &ComputeHookNotifyRequest,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<(), NotifyError> {
|
) -> Result<(), NotifyError> {
|
||||||
let req = self.client.request(Method::PUT, url);
|
let req = client.request(Method::PUT, url);
|
||||||
let req = if let Some(value) = &self.authorization_header {
|
let req = if let Some(value) = &self.authorization_header {
|
||||||
req.header(reqwest::header::AUTHORIZATION, value)
|
req.header(reqwest::header::AUTHORIZATION, value)
|
||||||
} else {
|
} else {
|
||||||
@@ -370,10 +280,11 @@ impl ComputeHook {
|
|||||||
Err(NotifyError::SlowDown)
|
Err(NotifyError::SlowDown)
|
||||||
}
|
}
|
||||||
StatusCode::LOCKED => {
|
StatusCode::LOCKED => {
|
||||||
// We consider this fatal, because it's possible that the operation blocking the control one is
|
// Delay our retry if busy: the usual fast exponential backoff in backoff::retry
|
||||||
// also the one that is waiting for this reconcile. We should let the reconciler calling
|
// is not appropriate
|
||||||
// this hook fail, to give control plane a chance to un-lock.
|
tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
|
||||||
tracing::info!("Control plane reports tenant is locked, dropping out of notify");
|
.await
|
||||||
|
.ok();
|
||||||
Err(NotifyError::Busy)
|
Err(NotifyError::Busy)
|
||||||
}
|
}
|
||||||
StatusCode::SERVICE_UNAVAILABLE
|
StatusCode::SERVICE_UNAVAILABLE
|
||||||
@@ -389,27 +300,13 @@ impl ComputeHook {
|
|||||||
async fn do_notify(
|
async fn do_notify(
|
||||||
&self,
|
&self,
|
||||||
url: &String,
|
url: &String,
|
||||||
reconfigure_request: &ComputeHookNotifyRequest,
|
reconfigure_request: ComputeHookNotifyRequest,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<(), NotifyError> {
|
) -> Result<(), NotifyError> {
|
||||||
// We hold these semaphore units across all retries, rather than only across each
|
let client = reqwest::Client::new();
|
||||||
// HTTP request: this is to preserve fairness and avoid a situation where a retry might
|
|
||||||
// time out waiting for a semaphore.
|
|
||||||
let _units = self
|
|
||||||
.api_concurrency
|
|
||||||
.acquire()
|
|
||||||
.await
|
|
||||||
// Interpret closed semaphore as shutdown
|
|
||||||
.map_err(|_| NotifyError::ShuttingDown)?;
|
|
||||||
|
|
||||||
backoff::retry(
|
backoff::retry(
|
||||||
|| self.do_notify_iteration(url, reconfigure_request, cancel),
|
|| self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|
||||||
|e| {
|
|e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
|
||||||
matches!(
|
|
||||||
e,
|
|
||||||
NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
|
|
||||||
)
|
|
||||||
},
|
|
||||||
3,
|
3,
|
||||||
10,
|
10,
|
||||||
"Send compute notification",
|
"Send compute notification",
|
||||||
@@ -443,70 +340,42 @@ impl ComputeHook {
|
|||||||
stripe_size: ShardStripeSize,
|
stripe_size: ShardStripeSize,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<(), NotifyError> {
|
) -> Result<(), NotifyError> {
|
||||||
let maybe_send_result = {
|
let mut locked = self.state.lock().await;
|
||||||
let mut state_locked = self.state.lock().unwrap();
|
|
||||||
|
|
||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
|
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
|
||||||
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
|
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
|
||||||
tenant_shard_id,
|
tenant_shard_id,
|
||||||
stripe_size,
|
stripe_size,
|
||||||
node_id,
|
node_id,
|
||||||
)),
|
)),
|
||||||
Entry::Occupied(e) => {
|
Entry::Occupied(e) => {
|
||||||
let tenant = e.into_mut();
|
let tenant = e.into_mut();
|
||||||
tenant.update(tenant_shard_id, stripe_size, node_id);
|
tenant.update(tenant_shard_id, stripe_size, node_id);
|
||||||
tenant
|
tenant
|
||||||
}
|
}
|
||||||
};
|
|
||||||
tenant.maybe_send(tenant_shard_id.tenant_id, None)
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Process result: we may get an update to send, or we may have to wait for a lock
|
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
|
||||||
// before trying again.
|
let Some(reconfigure_request) = reconfigure_request else {
|
||||||
let (request, mut send_lock_guard) = match maybe_send_result {
|
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
|
||||||
MaybeSendResult::Noop => {
|
// until it does.
|
||||||
return Ok(());
|
tracing::info!("Tenant isn't yet ready to emit a notification");
|
||||||
}
|
return Ok(());
|
||||||
MaybeSendResult::AwaitLock(send_lock) => {
|
|
||||||
let send_locked = send_lock.lock_owned().await;
|
|
||||||
|
|
||||||
// Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
|
|
||||||
// we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses
|
|
||||||
// try_lock.
|
|
||||||
let state_locked = self.state.lock().unwrap();
|
|
||||||
let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
|
|
||||||
return Ok(());
|
|
||||||
};
|
|
||||||
match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
|
|
||||||
MaybeSendResult::AwaitLock(_) => {
|
|
||||||
unreachable!("We supplied lock guard")
|
|
||||||
}
|
|
||||||
MaybeSendResult::Noop => {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
MaybeSendResult::Transmit((request, lock)) => (request, lock),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
MaybeSendResult::Transmit((request, lock)) => (request, lock),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let result = if let Some(notify_url) = &self.config.compute_hook_url {
|
if let Some(notify_url) = &self.config.compute_hook_url {
|
||||||
self.do_notify(notify_url, &request, cancel).await
|
self.do_notify(notify_url, reconfigure_request, cancel)
|
||||||
|
.await
|
||||||
} else {
|
} else {
|
||||||
self.do_notify_local(&request).await.map_err(|e| {
|
self.do_notify_local(reconfigure_request)
|
||||||
// This path is for testing only, so munge the error into our prod-style error type.
|
.await
|
||||||
tracing::error!("Local notification hook failed: {e}");
|
.map_err(|e| {
|
||||||
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
|
// This path is for testing only, so munge the error into our prod-style error type.
|
||||||
})
|
tracing::error!("Local notification hook failed: {e}");
|
||||||
};
|
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
|
||||||
|
})
|
||||||
if result.is_ok() {
|
|
||||||
// Before dropping the send lock, stash the request we just sent so that
|
|
||||||
// subsequent callers can avoid redundantly re-sending the same thing.
|
|
||||||
*send_lock_guard = Some(request);
|
|
||||||
}
|
}
|
||||||
result
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -530,22 +399,21 @@ pub(crate) mod tests {
|
|||||||
NodeId(1),
|
NodeId(1),
|
||||||
);
|
);
|
||||||
|
|
||||||
// An unsharded tenant is always ready to emit a notification, but won't
|
// An unsharded tenant is always ready to emit a notification
|
||||||
// send the same one twice
|
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
|
||||||
let send_result = tenant_state.maybe_send(tenant_id, None);
|
assert_eq!(
|
||||||
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
|
tenant_state
|
||||||
anyhow::bail!("Wrong send result");
|
.maybe_reconfigure(tenant_id)
|
||||||
};
|
.unwrap()
|
||||||
assert_eq!(request.shards.len(), 1);
|
.shards
|
||||||
assert!(request.stripe_size.is_none());
|
.len(),
|
||||||
|
1
|
||||||
// Simulate successful send
|
);
|
||||||
*guard = Some(request);
|
assert!(tenant_state
|
||||||
drop(guard);
|
.maybe_reconfigure(tenant_id)
|
||||||
|
.unwrap()
|
||||||
// Try asking again: this should be a no-op
|
.stripe_size
|
||||||
let send_result = tenant_state.maybe_send(tenant_id, None);
|
.is_none());
|
||||||
assert!(matches!(send_result, MaybeSendResult::Noop));
|
|
||||||
|
|
||||||
// Writing the first shard of a multi-sharded situation (i.e. in a split)
|
// Writing the first shard of a multi-sharded situation (i.e. in a split)
|
||||||
// resets the tenant state and puts it in an non-notifying state (need to
|
// resets the tenant state and puts it in an non-notifying state (need to
|
||||||
@@ -559,10 +427,7 @@ pub(crate) mod tests {
|
|||||||
ShardStripeSize(32768),
|
ShardStripeSize(32768),
|
||||||
NodeId(1),
|
NodeId(1),
|
||||||
);
|
);
|
||||||
assert!(matches!(
|
assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
|
||||||
tenant_state.maybe_send(tenant_id, None),
|
|
||||||
MaybeSendResult::Noop
|
|
||||||
));
|
|
||||||
|
|
||||||
// Writing the second shard makes it ready to notify
|
// Writing the second shard makes it ready to notify
|
||||||
tenant_state.update(
|
tenant_state.update(
|
||||||
@@ -575,16 +440,22 @@ pub(crate) mod tests {
|
|||||||
NodeId(1),
|
NodeId(1),
|
||||||
);
|
);
|
||||||
|
|
||||||
let send_result = tenant_state.maybe_send(tenant_id, None);
|
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
|
||||||
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
|
assert_eq!(
|
||||||
anyhow::bail!("Wrong send result");
|
tenant_state
|
||||||
};
|
.maybe_reconfigure(tenant_id)
|
||||||
assert_eq!(request.shards.len(), 2);
|
.unwrap()
|
||||||
assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
|
.shards
|
||||||
|
.len(),
|
||||||
// Simulate successful send
|
2
|
||||||
*guard = Some(request);
|
);
|
||||||
drop(guard);
|
assert_eq!(
|
||||||
|
tenant_state
|
||||||
|
.maybe_reconfigure(tenant_id)
|
||||||
|
.unwrap()
|
||||||
|
.stripe_size,
|
||||||
|
Some(ShardStripeSize(32768))
|
||||||
|
);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1,14 +1,7 @@
|
|||||||
use crate::metrics::{
|
|
||||||
HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
|
|
||||||
METRICS_REGISTRY,
|
|
||||||
};
|
|
||||||
use crate::reconciler::ReconcileError;
|
use crate::reconciler::ReconcileError;
|
||||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||||
use futures::Future;
|
|
||||||
use hyper::header::CONTENT_TYPE;
|
|
||||||
use hyper::{Body, Request, Response};
|
use hyper::{Body, Request, Response};
|
||||||
use hyper::{StatusCode, Uri};
|
use hyper::{StatusCode, Uri};
|
||||||
use metrics::{BuildInfo, NeonMetrics};
|
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||||
TenantTimeTravelRequest, TimelineCreateRequest,
|
TenantTimeTravelRequest, TimelineCreateRequest,
|
||||||
@@ -17,11 +10,9 @@ use pageserver_api::shard::TenantShardId;
|
|||||||
use pageserver_client::mgmt_api;
|
use pageserver_client::mgmt_api;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
use utils::auth::{Scope, SwappableJwtAuth};
|
use utils::auth::{Scope, SwappableJwtAuth};
|
||||||
use utils::failpoint_support::failpoints_handler;
|
|
||||||
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
|
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
|
||||||
use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
|
use utils::http::request::{must_get_query_param, parse_request_param};
|
||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
|
|
||||||
use utils::{
|
use utils::{
|
||||||
@@ -35,29 +26,22 @@ use utils::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use pageserver_api::controller_api::{
|
use pageserver_api::controller_api::{
|
||||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
|
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||||
TenantShardMigrateRequest,
|
|
||||||
};
|
};
|
||||||
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
||||||
|
|
||||||
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
|
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
|
||||||
|
|
||||||
use routerify::Middleware;
|
|
||||||
|
|
||||||
/// State available to HTTP request handlers
|
/// State available to HTTP request handlers
|
||||||
|
#[derive(Clone)]
|
||||||
pub struct HttpState {
|
pub struct HttpState {
|
||||||
service: Arc<crate::service::Service>,
|
service: Arc<crate::service::Service>,
|
||||||
auth: Option<Arc<SwappableJwtAuth>>,
|
auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
neon_metrics: NeonMetrics,
|
|
||||||
allowlist_routes: Vec<Uri>,
|
allowlist_routes: Vec<Uri>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl HttpState {
|
impl HttpState {
|
||||||
pub fn new(
|
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
|
||||||
service: Arc<crate::service::Service>,
|
|
||||||
auth: Option<Arc<SwappableJwtAuth>>,
|
|
||||||
build_info: BuildInfo,
|
|
||||||
) -> Self {
|
|
||||||
let allowlist_routes = ["/status", "/ready", "/metrics"]
|
let allowlist_routes = ["/status", "/ready", "/metrics"]
|
||||||
.iter()
|
.iter()
|
||||||
.map(|v| v.parse().unwrap())
|
.map(|v| v.parse().unwrap())
|
||||||
@@ -65,7 +49,6 @@ impl HttpState {
|
|||||||
Self {
|
Self {
|
||||||
service,
|
service,
|
||||||
auth,
|
auth,
|
||||||
neon_metrics: NeonMetrics::new(build_info),
|
|
||||||
allowlist_routes,
|
allowlist_routes,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -263,10 +246,8 @@ async fn handle_tenant_secondary_download(
|
|||||||
req: Request<Body>,
|
req: Request<Body>,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||||
let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
|
service.tenant_secondary_download(tenant_id).await?;
|
||||||
|
json_response(StatusCode::OK, ())
|
||||||
let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
|
|
||||||
json_response(status, progress)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_tenant_delete(
|
async fn handle_tenant_delete(
|
||||||
@@ -328,7 +309,7 @@ async fn handle_tenant_timeline_passthrough(
|
|||||||
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
|
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
|
||||||
|
|
||||||
// Find the node that holds shard zero
|
// Find the node that holds shard zero
|
||||||
let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
|
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
|
||||||
|
|
||||||
// Callers will always pass an unsharded tenant ID. Before proxying, we must
|
// Callers will always pass an unsharded tenant ID. Before proxying, we must
|
||||||
// rewrite this to a shard-aware shard zero ID.
|
// rewrite this to a shard-aware shard zero ID.
|
||||||
@@ -337,39 +318,12 @@ async fn handle_tenant_timeline_passthrough(
|
|||||||
let tenant_shard_str = format!("{}", tenant_shard_id);
|
let tenant_shard_str = format!("{}", tenant_shard_id);
|
||||||
let path = path.replace(&tenant_str, &tenant_shard_str);
|
let path = path.replace(&tenant_str, &tenant_shard_str);
|
||||||
|
|
||||||
let latency = &METRICS_REGISTRY
|
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
|
||||||
.metrics_group
|
|
||||||
.storage_controller_passthrough_request_latency;
|
|
||||||
|
|
||||||
// This is a bit awkward. We remove the param from the request
|
|
||||||
// and join the words by '_' to get a label for the request.
|
|
||||||
let just_path = path.replace(&tenant_shard_str, "");
|
|
||||||
let path_label = just_path
|
|
||||||
.split('/')
|
|
||||||
.filter(|token| !token.is_empty())
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.join("_");
|
|
||||||
let labels = PageserverRequestLabelGroup {
|
|
||||||
pageserver_id: &node.get_id().to_string(),
|
|
||||||
path: &path_label,
|
|
||||||
method: crate::metrics::Method::Get,
|
|
||||||
};
|
|
||||||
|
|
||||||
let _timer = latency.start_timer(labels.clone());
|
|
||||||
|
|
||||||
let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
|
|
||||||
let resp = client.get_raw(path).await.map_err(|_e|
|
let resp = client.get_raw(path).await.map_err(|_e|
|
||||||
// FIXME: give APiError a proper Unavailable variant. We return 503 here because
|
// FIXME: give APiError a proper Unavailable variant. We return 503 here because
|
||||||
// if we can't successfully send a request to the pageserver, we aren't available.
|
// if we can't successfully send a request to the pageserver, we aren't available.
|
||||||
ApiError::ShuttingDown)?;
|
ApiError::ShuttingDown)?;
|
||||||
|
|
||||||
if !resp.status().is_success() {
|
|
||||||
let error_counter = &METRICS_REGISTRY
|
|
||||||
.metrics_group
|
|
||||||
.storage_controller_passthrough_request_error;
|
|
||||||
error_counter.inc(labels);
|
|
||||||
}
|
|
||||||
|
|
||||||
// We have a reqest::Response, would like a http::Response
|
// We have a reqest::Response, would like a http::Response
|
||||||
let mut builder = hyper::Response::builder()
|
let mut builder = hyper::Response::builder()
|
||||||
.status(resp.status())
|
.status(resp.status())
|
||||||
@@ -395,25 +349,6 @@ async fn handle_tenant_locate(
|
|||||||
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_tenant_describe(
|
|
||||||
service: Arc<Service>,
|
|
||||||
req: Request<Body>,
|
|
||||||
) -> Result<Response<Body>, ApiError> {
|
|
||||||
check_permissions(&req, Scope::Admin)?;
|
|
||||||
|
|
||||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
|
||||||
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_tenant_list(
|
|
||||||
service: Arc<Service>,
|
|
||||||
req: Request<Body>,
|
|
||||||
) -> Result<Response<Body>, ApiError> {
|
|
||||||
check_permissions(&req, Scope::Admin)?;
|
|
||||||
|
|
||||||
json_response(StatusCode::OK, service.tenant_list())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
check_permissions(&req, Scope::Admin)?;
|
check_permissions(&req, Scope::Admin)?;
|
||||||
|
|
||||||
@@ -427,10 +362,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
|
|||||||
check_permissions(&req, Scope::Admin)?;
|
check_permissions(&req, Scope::Admin)?;
|
||||||
|
|
||||||
let state = get_state(&req);
|
let state = get_state(&req);
|
||||||
let nodes = state.service.node_list().await?;
|
json_response(StatusCode::OK, state.service.node_list().await?)
|
||||||
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
|
|
||||||
|
|
||||||
json_response(StatusCode::OK, api_nodes)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
@@ -455,14 +387,7 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
|||||||
|
|
||||||
json_response(
|
json_response(
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
state
|
state.service.node_configure(config_req).await?,
|
||||||
.service
|
|
||||||
.node_configure(
|
|
||||||
config_req.node_id,
|
|
||||||
config_req.availability.map(NodeAvailability::from),
|
|
||||||
config_req.scheduling,
|
|
||||||
)
|
|
||||||
.await?,
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -497,22 +422,6 @@ async fn handle_tenant_shard_migrate(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
|
||||||
check_permissions(&req, Scope::Admin)?;
|
|
||||||
|
|
||||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
|
||||||
let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
|
|
||||||
let state = get_state(&req);
|
|
||||||
|
|
||||||
json_response(
|
|
||||||
StatusCode::OK,
|
|
||||||
state
|
|
||||||
.service
|
|
||||||
.tenant_update_policy(tenant_id, update_req)
|
|
||||||
.await?,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||||
check_permissions(&req, Scope::PageServerApi)?;
|
check_permissions(&req, Scope::PageServerApi)?;
|
||||||
@@ -522,18 +431,6 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
|
|||||||
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
|
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
|
||||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
|
||||||
check_permissions(&req, Scope::PageServerApi)?;
|
|
||||||
|
|
||||||
let state = get_state(&req);
|
|
||||||
|
|
||||||
json_response(
|
|
||||||
StatusCode::OK,
|
|
||||||
state.service.tenant_import(tenant_id).await?,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
check_permissions(&req, Scope::Admin)?;
|
check_permissions(&req, Scope::Admin)?;
|
||||||
|
|
||||||
@@ -556,14 +453,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
|
|||||||
json_response(StatusCode::OK, state.service.consistency_check().await?)
|
json_response(StatusCode::OK, state.service.consistency_check().await?)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
|
||||||
check_permissions(&req, Scope::Admin)?;
|
|
||||||
|
|
||||||
let state = get_state(&req);
|
|
||||||
|
|
||||||
json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Status endpoint is just used for checking that our HTTP listener is up
|
/// Status endpoint is just used for checking that our HTTP listener is up
|
||||||
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
@@ -588,11 +477,7 @@ impl From<ReconcileError> for ApiError {
|
|||||||
|
|
||||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
||||||
/// be allowed to run if Service has finished its initial reconciliation.
|
/// be allowed to run if Service has finished its initial reconciliation.
|
||||||
async fn tenant_service_handler<R, H>(
|
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
|
||||||
request: Request<Body>,
|
|
||||||
handler: H,
|
|
||||||
request_name: RequestName,
|
|
||||||
) -> R::Output
|
|
||||||
where
|
where
|
||||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||||
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
|
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
|
||||||
@@ -612,122 +497,24 @@ where
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
named_request_span(
|
request_span(
|
||||||
request,
|
request,
|
||||||
|request| async move { handler(service, request).await },
|
|request| async move { handler(service, request).await },
|
||||||
request_name,
|
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check if the required scope is held in the request's token, or if the request has
|
|
||||||
/// a token with 'admin' scope then always permit it.
|
|
||||||
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
|
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
|
||||||
check_permission_with(request, |claims| {
|
check_permission_with(request, |claims| {
|
||||||
match crate::auth::check_permission(claims, required_scope) {
|
crate::auth::check_permission(claims, required_scope)
|
||||||
Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
|
|
||||||
Ok(()) => Ok(()),
|
|
||||||
Err(_) => Err(e),
|
|
||||||
},
|
|
||||||
Ok(()) => Ok(()),
|
|
||||||
}
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
struct RequestMeta {
|
|
||||||
method: hyper::http::Method,
|
|
||||||
at: Instant,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
|
||||||
) -> Middleware<B, ApiError> {
|
|
||||||
Middleware::pre(move |req| async move {
|
|
||||||
let meta = RequestMeta {
|
|
||||||
method: req.method().clone(),
|
|
||||||
at: Instant::now(),
|
|
||||||
};
|
|
||||||
|
|
||||||
req.set_context(meta);
|
|
||||||
|
|
||||||
Ok(req)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
|
||||||
) -> Middleware<B, ApiError> {
|
|
||||||
Middleware::post_with_info(move |resp, req_info| async move {
|
|
||||||
let request_name = match req_info.context::<RequestName>() {
|
|
||||||
Some(name) => name,
|
|
||||||
None => {
|
|
||||||
return Ok(resp);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Some(meta) = req_info.context::<RequestMeta>() {
|
|
||||||
let status = &crate::metrics::METRICS_REGISTRY
|
|
||||||
.metrics_group
|
|
||||||
.storage_controller_http_request_status;
|
|
||||||
let latency = &crate::metrics::METRICS_REGISTRY
|
|
||||||
.metrics_group
|
|
||||||
.storage_controller_http_request_latency;
|
|
||||||
|
|
||||||
status.inc(HttpRequestStatusLabelGroup {
|
|
||||||
path: request_name.0,
|
|
||||||
method: meta.method.clone().into(),
|
|
||||||
status: crate::metrics::StatusCode(resp.status()),
|
|
||||||
});
|
|
||||||
|
|
||||||
latency.observe(
|
|
||||||
HttpRequestLatencyLabelGroup {
|
|
||||||
path: request_name.0,
|
|
||||||
method: meta.method.into(),
|
|
||||||
},
|
|
||||||
meta.at.elapsed().as_secs_f64(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Ok(resp)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
|
||||||
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
|
|
||||||
|
|
||||||
let state = get_state(&req);
|
|
||||||
let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
|
|
||||||
let response = Response::builder()
|
|
||||||
.status(200)
|
|
||||||
.header(CONTENT_TYPE, TEXT_FORMAT)
|
|
||||||
.body(payload.into())
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
Ok(response)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
struct RequestName(&'static str);
|
|
||||||
|
|
||||||
async fn named_request_span<R, H>(
|
|
||||||
request: Request<Body>,
|
|
||||||
handler: H,
|
|
||||||
name: RequestName,
|
|
||||||
) -> R::Output
|
|
||||||
where
|
|
||||||
R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
|
||||||
H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
|
|
||||||
{
|
|
||||||
request.set_context(name);
|
|
||||||
request_span(request, handler).await
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn make_router(
|
pub fn make_router(
|
||||||
service: Arc<Service>,
|
service: Arc<Service>,
|
||||||
auth: Option<Arc<SwappableJwtAuth>>,
|
auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
build_info: BuildInfo,
|
|
||||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||||
let mut router = endpoint::make_router()
|
let mut router = endpoint::make_router();
|
||||||
.middleware(prologue_metrics_middleware())
|
|
||||||
.middleware(epilogue_metrics_middleware());
|
|
||||||
if auth.is_some() {
|
if auth.is_some() {
|
||||||
router = router.middleware(auth_middleware(|request| {
|
router = router.middleware(auth_middleware(|request| {
|
||||||
let state = get_state(request);
|
let state = get_state(request);
|
||||||
@@ -736,186 +523,93 @@ pub fn make_router(
|
|||||||
} else {
|
} else {
|
||||||
state.auth.as_deref()
|
state.auth.as_deref()
|
||||||
}
|
}
|
||||||
}));
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
router
|
router
|
||||||
.data(Arc::new(HttpState::new(service, auth, build_info)))
|
.data(Arc::new(HttpState::new(service, auth)))
|
||||||
.get("/metrics", |r| {
|
|
||||||
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
|
|
||||||
})
|
|
||||||
// Non-prefixed generic endpoints (status, metrics)
|
// Non-prefixed generic endpoints (status, metrics)
|
||||||
.get("/status", |r| {
|
.get("/status", |r| request_span(r, handle_status))
|
||||||
named_request_span(r, handle_status, RequestName("status"))
|
.get("/ready", |r| request_span(r, handle_ready))
|
||||||
})
|
|
||||||
.get("/ready", |r| {
|
|
||||||
named_request_span(r, handle_ready, RequestName("ready"))
|
|
||||||
})
|
|
||||||
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
||||||
.post("/upcall/v1/re-attach", |r| {
|
.post("/upcall/v1/re-attach", |r| {
|
||||||
named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
|
request_span(r, handle_re_attach)
|
||||||
})
|
|
||||||
.post("/upcall/v1/validate", |r| {
|
|
||||||
named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
|
|
||||||
})
|
})
|
||||||
|
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
|
||||||
// Test/dev/debug endpoints
|
// Test/dev/debug endpoints
|
||||||
.post("/debug/v1/attach-hook", |r| {
|
.post("/debug/v1/attach-hook", |r| {
|
||||||
named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
|
request_span(r, handle_attach_hook)
|
||||||
})
|
|
||||||
.post("/debug/v1/inspect", |r| {
|
|
||||||
named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
|
|
||||||
})
|
})
|
||||||
|
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
|
||||||
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
|
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
|
||||||
named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
|
request_span(r, handle_tenant_drop)
|
||||||
})
|
})
|
||||||
.post("/debug/v1/node/:node_id/drop", |r| {
|
.post("/debug/v1/node/:node_id/drop", |r| {
|
||||||
named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
|
request_span(r, handle_node_drop)
|
||||||
})
|
|
||||||
.post("/debug/v1/tenant/:tenant_id/import", |r| {
|
|
||||||
named_request_span(
|
|
||||||
r,
|
|
||||||
handle_tenant_import,
|
|
||||||
RequestName("debug_v1_tenant_import"),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.get("/debug/v1/tenant", |r| {
|
|
||||||
named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
|
|
||||||
})
|
|
||||||
.get("/debug/v1/tenant/:tenant_id/locate", |r| {
|
|
||||||
tenant_service_handler(
|
|
||||||
r,
|
|
||||||
handle_tenant_locate,
|
|
||||||
RequestName("debug_v1_tenant_locate"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
|
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
|
||||||
.get("/debug/v1/scheduler", |r| {
|
.get("/debug/v1/scheduler", |r| {
|
||||||
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
|
request_span(r, handle_scheduler_dump)
|
||||||
})
|
})
|
||||||
.post("/debug/v1/consistency_check", |r| {
|
.post("/debug/v1/consistency_check", |r| {
|
||||||
named_request_span(
|
request_span(r, handle_consistency_check)
|
||||||
r,
|
|
||||||
handle_consistency_check,
|
|
||||||
RequestName("debug_v1_consistency_check"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
.post("/debug/v1/reconcile_all", |r| {
|
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||||
request_span(r, handle_reconcile_all)
|
tenant_service_handler(r, handle_tenant_locate)
|
||||||
})
|
|
||||||
.put("/debug/v1/failpoints", |r| {
|
|
||||||
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
|
|
||||||
})
|
})
|
||||||
// Node operations
|
// Node operations
|
||||||
.post("/control/v1/node", |r| {
|
.post("/control/v1/node", |r| {
|
||||||
named_request_span(r, handle_node_register, RequestName("control_v1_node"))
|
request_span(r, handle_node_register)
|
||||||
})
|
|
||||||
.get("/control/v1/node", |r| {
|
|
||||||
named_request_span(r, handle_node_list, RequestName("control_v1_node"))
|
|
||||||
})
|
})
|
||||||
|
.get("/control/v1/node", |r| request_span(r, handle_node_list))
|
||||||
.put("/control/v1/node/:node_id/config", |r| {
|
.put("/control/v1/node/:node_id/config", |r| {
|
||||||
named_request_span(
|
request_span(r, handle_node_configure)
|
||||||
r,
|
|
||||||
handle_node_configure,
|
|
||||||
RequestName("control_v1_node_config"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
// Tenant Shard operations
|
// Tenant Shard operations
|
||||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||||
tenant_service_handler(
|
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||||
r,
|
|
||||||
handle_tenant_shard_migrate,
|
|
||||||
RequestName("control_v1_tenant_migrate"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
||||||
tenant_service_handler(
|
tenant_service_handler(r, handle_tenant_shard_split)
|
||||||
r,
|
|
||||||
handle_tenant_shard_split,
|
|
||||||
RequestName("control_v1_tenant_shard_split"),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.get("/control/v1/tenant/:tenant_id", |r| {
|
|
||||||
tenant_service_handler(
|
|
||||||
r,
|
|
||||||
handle_tenant_describe,
|
|
||||||
RequestName("control_v1_tenant_describe"),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.get("/control/v1/tenant", |r| {
|
|
||||||
tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
|
|
||||||
})
|
|
||||||
.put("/control/v1/tenant/:tenant_id/policy", |r| {
|
|
||||||
named_request_span(
|
|
||||||
r,
|
|
||||||
handle_tenant_update_policy,
|
|
||||||
RequestName("control_v1_tenant_policy"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
// Tenant operations
|
// Tenant operations
|
||||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||||
.post("/v1/tenant", |r| {
|
.post("/v1/tenant", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
|
tenant_service_handler(r, handle_tenant_create)
|
||||||
})
|
})
|
||||||
.delete("/v1/tenant/:tenant_id", |r| {
|
.delete("/v1/tenant/:tenant_id", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
|
tenant_service_handler(r, handle_tenant_delete)
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/config", |r| {
|
.put("/v1/tenant/config", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
|
tenant_service_handler(r, handle_tenant_config_set)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
|
tenant_service_handler(r, handle_tenant_config_get)
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
||||||
tenant_service_handler(
|
tenant_service_handler(r, handle_tenant_location_config)
|
||||||
r,
|
|
||||||
handle_tenant_location_config,
|
|
||||||
RequestName("v1_tenant_location_config"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
||||||
tenant_service_handler(
|
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
|
||||||
r,
|
|
||||||
handle_tenant_time_travel_remote_storage,
|
|
||||||
RequestName("v1_tenant_time_travel_remote_storage"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
|
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
|
||||||
tenant_service_handler(
|
tenant_service_handler(r, handle_tenant_secondary_download)
|
||||||
r,
|
|
||||||
handle_tenant_secondary_download,
|
|
||||||
RequestName("v1_tenant_secondary_download"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
// Timeline operations
|
// Timeline operations
|
||||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||||
tenant_service_handler(
|
tenant_service_handler(r, handle_tenant_timeline_delete)
|
||||||
r,
|
|
||||||
handle_tenant_timeline_delete,
|
|
||||||
RequestName("v1_tenant_timeline"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||||
tenant_service_handler(
|
tenant_service_handler(r, handle_tenant_timeline_create)
|
||||||
r,
|
|
||||||
handle_tenant_timeline_create,
|
|
||||||
RequestName("v1_tenant_timeline"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
// Tenant detail GET passthrough to shard zero
|
// Tenant detail GET passthrough to shard zero
|
||||||
.get("/v1/tenant/:tenant_id", |r| {
|
.get("/v1/tenant/:tenant_id", |r| {
|
||||||
tenant_service_handler(
|
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||||
r,
|
|
||||||
handle_tenant_timeline_passthrough,
|
|
||||||
RequestName("v1_tenant_passthrough"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
|
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
|
||||||
// timeline GET APIs will be implicitly included.
|
// timeline GET APIs will be implicitly included.
|
||||||
.get("/v1/tenant/:tenant_id/timeline*", |r| {
|
.get("/v1/tenant/:tenant_id/timeline*", |r| {
|
||||||
tenant_service_handler(
|
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||||
r,
|
|
||||||
handle_tenant_timeline_passthrough,
|
|
||||||
RequestName("v1_tenant_timeline_passthrough"),
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -3,18 +3,15 @@ use utils::seqwait::MonotonicCounter;
|
|||||||
|
|
||||||
mod auth;
|
mod auth;
|
||||||
mod compute_hook;
|
mod compute_hook;
|
||||||
mod heartbeater;
|
|
||||||
pub mod http;
|
pub mod http;
|
||||||
mod id_lock_map;
|
|
||||||
pub mod metrics;
|
pub mod metrics;
|
||||||
mod node;
|
mod node;
|
||||||
mod pageserver_client;
|
|
||||||
pub mod persistence;
|
pub mod persistence;
|
||||||
mod reconciler;
|
mod reconciler;
|
||||||
mod scheduler;
|
mod scheduler;
|
||||||
mod schema;
|
mod schema;
|
||||||
pub mod service;
|
pub mod service;
|
||||||
mod tenant_shard;
|
mod tenant_state;
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
|
||||||
struct Sequence(u64);
|
struct Sequence(u64);
|
||||||
@@ -1,20 +1,19 @@
|
|||||||
use anyhow::{anyhow, Context};
|
use anyhow::{anyhow, Context};
|
||||||
|
use attachment_service::http::make_router;
|
||||||
|
use attachment_service::metrics::preinitialize_metrics;
|
||||||
|
use attachment_service::persistence::Persistence;
|
||||||
|
use attachment_service::service::{Config, Service};
|
||||||
|
use aws_config::{BehaviorVersion, Region};
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use diesel::Connection;
|
use diesel::Connection;
|
||||||
use metrics::launch_timestamp::LaunchTimestamp;
|
use metrics::launch_timestamp::LaunchTimestamp;
|
||||||
use metrics::BuildInfo;
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use storage_controller::http::make_router;
|
|
||||||
use storage_controller::metrics::preinitialize_metrics;
|
|
||||||
use storage_controller::persistence::Persistence;
|
|
||||||
use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
|
|
||||||
use tokio::signal::unix::SignalKind;
|
use tokio::signal::unix::SignalKind;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||||
use utils::logging::{self, LogFormat};
|
use utils::logging::{self, LogFormat};
|
||||||
|
|
||||||
use utils::sentry_init::init_sentry;
|
|
||||||
use utils::{project_build_tag, project_git_version, tcp_listener};
|
use utils::{project_build_tag, project_git_version, tcp_listener};
|
||||||
|
|
||||||
project_git_version!(GIT_VERSION);
|
project_git_version!(GIT_VERSION);
|
||||||
@@ -52,33 +51,9 @@ struct Cli {
|
|||||||
#[arg(short, long)]
|
#[arg(short, long)]
|
||||||
path: Option<Utf8PathBuf>,
|
path: Option<Utf8PathBuf>,
|
||||||
|
|
||||||
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
|
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
database_url: Option<String>,
|
database_url: Option<String>,
|
||||||
|
|
||||||
/// Flag to enable dev mode, which permits running without auth
|
|
||||||
#[arg(long, default_value = "false")]
|
|
||||||
dev: bool,
|
|
||||||
|
|
||||||
/// Grace period before marking unresponsive pageserver offline
|
|
||||||
#[arg(long)]
|
|
||||||
max_unavailable_interval: Option<humantime::Duration>,
|
|
||||||
}
|
|
||||||
|
|
||||||
enum StrictMode {
|
|
||||||
/// In strict mode, we will require that all secrets are loaded, i.e. security features
|
|
||||||
/// may not be implicitly turned off by omitting secrets in the environment.
|
|
||||||
Strict,
|
|
||||||
/// In dev mode, secrets are optional, and omitting a particular secret will implicitly
|
|
||||||
/// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated
|
|
||||||
/// requests, no public key -> don't authenticate incoming requests).
|
|
||||||
Dev,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for StrictMode {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self::Strict
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
|
/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
|
||||||
@@ -91,6 +66,13 @@ struct Secrets {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Secrets {
|
impl Secrets {
|
||||||
|
const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
|
||||||
|
const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
|
||||||
|
"neon-storage-controller-pageserver-jwt-token";
|
||||||
|
const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
|
||||||
|
"neon-storage-controller-control-plane-jwt-token";
|
||||||
|
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
|
||||||
|
|
||||||
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
|
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
|
||||||
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
|
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
|
||||||
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
|
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
|
||||||
@@ -101,41 +83,111 @@ impl Secrets {
|
|||||||
/// - Environment variables if DATABASE_URL is set.
|
/// - Environment variables if DATABASE_URL is set.
|
||||||
/// - AWS Secrets Manager secrets
|
/// - AWS Secrets Manager secrets
|
||||||
async fn load(args: &Cli) -> anyhow::Result<Self> {
|
async fn load(args: &Cli) -> anyhow::Result<Self> {
|
||||||
let Some(database_url) =
|
match &args.database_url {
|
||||||
Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
|
Some(url) => Self::load_cli(url, args),
|
||||||
else {
|
None => match std::env::var(Self::DATABASE_URL_ENV) {
|
||||||
anyhow::bail!(
|
Ok(database_url) => Self::load_env(database_url),
|
||||||
"Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
|
Err(_) => Self::load_aws_sm().await,
|
||||||
)
|
},
|
||||||
};
|
}
|
||||||
|
|
||||||
let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
|
|
||||||
Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
let this = Self {
|
|
||||||
database_url,
|
|
||||||
public_key,
|
|
||||||
jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
|
|
||||||
control_plane_jwt_token: Self::load_secret(
|
|
||||||
&args.control_plane_jwt_token,
|
|
||||||
Self::CONTROL_PLANE_JWT_TOKEN_ENV,
|
|
||||||
)
|
|
||||||
.await,
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(this)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
|
fn load_env(database_url: String) -> anyhow::Result<Self> {
|
||||||
if let Some(v) = cli {
|
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
|
||||||
Some(v.clone())
|
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
|
||||||
} else if let Ok(v) = std::env::var(env_name) {
|
Err(_) => None,
|
||||||
Some(v)
|
};
|
||||||
} else {
|
Ok(Self {
|
||||||
None
|
database_url,
|
||||||
|
public_key,
|
||||||
|
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
|
||||||
|
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn load_aws_sm() -> anyhow::Result<Self> {
|
||||||
|
let Ok(region) = std::env::var("AWS_REGION") else {
|
||||||
|
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
|
||||||
|
};
|
||||||
|
let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
|
||||||
|
.region(Region::new(region.clone()))
|
||||||
|
.load()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let asm = aws_sdk_secretsmanager::Client::new(&config);
|
||||||
|
|
||||||
|
let Some(database_url) = asm
|
||||||
|
.get_secret_value()
|
||||||
|
.secret_id(Self::DATABASE_URL_SECRET)
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.secret_string()
|
||||||
|
.map(str::to_string)
|
||||||
|
else {
|
||||||
|
anyhow::bail!(
|
||||||
|
"Database URL secret not found at {region}/{}",
|
||||||
|
Self::DATABASE_URL_SECRET
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
let jwt_token = asm
|
||||||
|
.get_secret_value()
|
||||||
|
.secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.secret_string()
|
||||||
|
.map(str::to_string);
|
||||||
|
if jwt_token.is_none() {
|
||||||
|
tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let control_plane_jwt_token = asm
|
||||||
|
.get_secret_value()
|
||||||
|
.secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.secret_string()
|
||||||
|
.map(str::to_string);
|
||||||
|
if jwt_token.is_none() {
|
||||||
|
tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver");
|
||||||
|
}
|
||||||
|
|
||||||
|
let public_key = asm
|
||||||
|
.get_secret_value()
|
||||||
|
.secret_id(Self::PUBLIC_KEY_SECRET)
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.secret_string()
|
||||||
|
.map(str::to_string);
|
||||||
|
let public_key = match public_key {
|
||||||
|
Some(key) => Some(JwtAuth::from_key(key)?),
|
||||||
|
None => {
|
||||||
|
tracing::warn!(
|
||||||
|
"No public key set: inccoming HTTP requests will not be authenticated"
|
||||||
|
);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
database_url,
|
||||||
|
public_key,
|
||||||
|
jwt_token,
|
||||||
|
control_plane_jwt_token,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
|
||||||
|
let public_key = match &args.public_key {
|
||||||
|
None => None,
|
||||||
|
Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
|
||||||
|
};
|
||||||
|
Ok(Self {
|
||||||
|
database_url: database_url.to_owned(),
|
||||||
|
public_key,
|
||||||
|
jwt_token: args.jwt_token.clone(),
|
||||||
|
control_plane_jwt_token: args.control_plane_jwt_token.clone(),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -154,14 +206,6 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn main() -> anyhow::Result<()> {
|
fn main() -> anyhow::Result<()> {
|
||||||
let default_panic = std::panic::take_hook();
|
|
||||||
std::panic::set_hook(Box::new(move |info| {
|
|
||||||
default_panic(info);
|
|
||||||
std::process::exit(1);
|
|
||||||
}));
|
|
||||||
|
|
||||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
|
||||||
|
|
||||||
tokio::runtime::Builder::new_current_thread()
|
tokio::runtime::Builder::new_current_thread()
|
||||||
// We use spawn_blocking for database operations, so require approximately
|
// We use spawn_blocking for database operations, so require approximately
|
||||||
// as many blocking threads as we will open database connections.
|
// as many blocking threads as we will open database connections.
|
||||||
@@ -193,55 +237,12 @@ async fn async_main() -> anyhow::Result<()> {
|
|||||||
args.listen
|
args.listen
|
||||||
);
|
);
|
||||||
|
|
||||||
let build_info = BuildInfo {
|
|
||||||
revision: GIT_VERSION,
|
|
||||||
build_tag: BUILD_TAG,
|
|
||||||
};
|
|
||||||
|
|
||||||
let strict_mode = if args.dev {
|
|
||||||
StrictMode::Dev
|
|
||||||
} else {
|
|
||||||
StrictMode::Strict
|
|
||||||
};
|
|
||||||
|
|
||||||
let secrets = Secrets::load(&args).await?;
|
let secrets = Secrets::load(&args).await?;
|
||||||
|
|
||||||
// Validate required secrets and arguments are provided in strict mode
|
|
||||||
match strict_mode {
|
|
||||||
StrictMode::Strict
|
|
||||||
if (secrets.public_key.is_none()
|
|
||||||
|| secrets.jwt_token.is_none()
|
|
||||||
|| secrets.control_plane_jwt_token.is_none()) =>
|
|
||||||
{
|
|
||||||
// Production systems should always have secrets configured: if public_key was not set
|
|
||||||
// then we would implicitly disable auth.
|
|
||||||
anyhow::bail!(
|
|
||||||
"Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
StrictMode::Strict if args.compute_hook_url.is_none() => {
|
|
||||||
// Production systems should always have a compute hook set, to prevent falling
|
|
||||||
// back to trying to use neon_local.
|
|
||||||
anyhow::bail!(
|
|
||||||
"`--compute-hook-url` is not set: this is only permitted in `--dev` mode"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
StrictMode::Strict => {
|
|
||||||
tracing::info!("Starting in strict mode: configuration is OK.")
|
|
||||||
}
|
|
||||||
StrictMode::Dev => {
|
|
||||||
tracing::warn!("Starting in dev mode: this may be an insecure configuration.")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let config = Config {
|
let config = Config {
|
||||||
jwt_token: secrets.jwt_token,
|
jwt_token: secrets.jwt_token,
|
||||||
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
||||||
compute_hook_url: args.compute_hook_url,
|
compute_hook_url: args.compute_hook_url,
|
||||||
max_unavailable_interval: args
|
|
||||||
.max_unavailable_interval
|
|
||||||
.map(humantime::Duration::into)
|
|
||||||
.unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// After loading secrets & config, but before starting anything else, apply database migrations
|
// After loading secrets & config, but before starting anything else, apply database migrations
|
||||||
@@ -259,7 +260,7 @@ async fn async_main() -> anyhow::Result<()> {
|
|||||||
let auth = secrets
|
let auth = secrets
|
||||||
.public_key
|
.public_key
|
||||||
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
|
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
|
||||||
let router = make_router(service.clone(), auth, build_info)
|
let router = make_router(service.clone(), auth)
|
||||||
.build()
|
.build()
|
||||||
.map_err(|err| anyhow!(err))?;
|
.map_err(|err| anyhow!(err))?;
|
||||||
let router_service = utils::http::RouterService::new(router).unwrap();
|
let router_service = utils::http::RouterService::new(router).unwrap();
|
||||||
32
control_plane/attachment_service/src/metrics.rs
Normal file
32
control_plane/attachment_service/src/metrics.rs
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
pub(crate) struct ReconcilerMetrics {
|
||||||
|
pub(crate) spawned: IntCounter,
|
||||||
|
pub(crate) complete: IntCounterVec,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ReconcilerMetrics {
|
||||||
|
// Labels used on [`Self::complete`]
|
||||||
|
pub(crate) const SUCCESS: &'static str = "ok";
|
||||||
|
pub(crate) const ERROR: &'static str = "success";
|
||||||
|
pub(crate) const CANCEL: &'static str = "cancel";
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
|
||||||
|
spawned: register_int_counter!(
|
||||||
|
"storage_controller_reconcile_spawn",
|
||||||
|
"Count of how many times we spawn a reconcile task",
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
complete: register_int_counter_vec!(
|
||||||
|
"storage_controller_reconcile_complete",
|
||||||
|
"Reconciler tasks completed, broken down by success/failure/cancelled",
|
||||||
|
&["status"],
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
});
|
||||||
|
|
||||||
|
pub fn preinitialize_metrics() {
|
||||||
|
Lazy::force(&RECONCILER);
|
||||||
|
}
|
||||||
@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
|
|||||||
use hyper::StatusCode;
|
use hyper::StatusCode;
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
controller_api::{
|
controller_api::{
|
||||||
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
|
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
|
||||||
TenantLocateResponseShard,
|
|
||||||
},
|
},
|
||||||
shard::TenantShardId,
|
shard::TenantShardId,
|
||||||
};
|
};
|
||||||
@@ -13,9 +12,7 @@ use serde::Serialize;
|
|||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use utils::{backoff, id::NodeId};
|
use utils::{backoff, id::NodeId};
|
||||||
|
|
||||||
use crate::{
|
use crate::persistence::NodePersistence;
|
||||||
pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Represents the in-memory description of a Node.
|
/// Represents the in-memory description of a Node.
|
||||||
///
|
///
|
||||||
@@ -86,38 +83,29 @@ impl Node {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
|
pub(crate) fn set_availability(
|
||||||
match self.get_availability_transition(availability) {
|
&mut self,
|
||||||
AvailabilityTransition::ToActive => {
|
availability: NodeAvailability,
|
||||||
|
) -> AvailabilityTransition {
|
||||||
|
use NodeAvailability::*;
|
||||||
|
let transition = match (self.availability, availability) {
|
||||||
|
(Offline, Active) => {
|
||||||
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
|
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
|
||||||
// users of previously-cloned copies of the node will still see the old cancellation
|
// users of previously-cloned copies of the node will still see the old cancellation
|
||||||
// state. For example, Reconcilers in flight will have to complete and be spawned
|
// state. For example, Reconcilers in flight will have to complete and be spawned
|
||||||
// again to realize that the node has become available.
|
// again to realize that the node has become available.
|
||||||
self.cancel = CancellationToken::new();
|
self.cancel = CancellationToken::new();
|
||||||
|
AvailabilityTransition::ToActive
|
||||||
}
|
}
|
||||||
AvailabilityTransition::ToOffline => {
|
(Active, Offline) => {
|
||||||
// Fire the node's cancellation token to cancel any in-flight API requests to it
|
// Fire the node's cancellation token to cancel any in-flight API requests to it
|
||||||
self.cancel.cancel();
|
self.cancel.cancel();
|
||||||
|
AvailabilityTransition::ToOffline
|
||||||
}
|
}
|
||||||
AvailabilityTransition::Unchanged => {}
|
_ => AvailabilityTransition::Unchanged,
|
||||||
}
|
};
|
||||||
self.availability = availability;
|
self.availability = availability;
|
||||||
}
|
transition
|
||||||
|
|
||||||
/// Without modifying the availability of the node, convert the intended availability
|
|
||||||
/// into a description of the transition.
|
|
||||||
pub(crate) fn get_availability_transition(
|
|
||||||
&self,
|
|
||||||
availability: NodeAvailability,
|
|
||||||
) -> AvailabilityTransition {
|
|
||||||
use AvailabilityTransition::*;
|
|
||||||
use NodeAvailability::*;
|
|
||||||
|
|
||||||
match (self.availability, availability) {
|
|
||||||
(Offline, Active(_)) => ToActive,
|
|
||||||
(Active(_), Offline) => ToOffline,
|
|
||||||
_ => Unchanged,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Whether we may send API requests to this node.
|
/// Whether we may send API requests to this node.
|
||||||
@@ -126,21 +114,21 @@ impl Node {
|
|||||||
// a reference to the original Node's cancellation status. Checking both of these results
|
// a reference to the original Node's cancellation status. Checking both of these results
|
||||||
// in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
|
// in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
|
||||||
// when we cloned it, or if the original Node instance's cancellation token was fired.
|
// when we cloned it, or if the original Node instance's cancellation token was fired.
|
||||||
matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
|
matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Is this node elegible to have work scheduled onto it?
|
/// Is this node elegible to have work scheduled onto it?
|
||||||
pub(crate) fn may_schedule(&self) -> MaySchedule {
|
pub(crate) fn may_schedule(&self) -> bool {
|
||||||
let score = match self.availability {
|
match self.availability {
|
||||||
NodeAvailability::Active(score) => score,
|
NodeAvailability::Active => {}
|
||||||
NodeAvailability::Offline => return MaySchedule::No,
|
NodeAvailability::Offline => return false,
|
||||||
};
|
}
|
||||||
|
|
||||||
match self.scheduling {
|
match self.scheduling {
|
||||||
NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
|
NodeSchedulingPolicy::Active => true,
|
||||||
NodeSchedulingPolicy::Draining => MaySchedule::No,
|
NodeSchedulingPolicy::Draining => false,
|
||||||
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
|
NodeSchedulingPolicy::Filling => true,
|
||||||
NodeSchedulingPolicy::Pause => MaySchedule::No,
|
NodeSchedulingPolicy::Pause => false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -158,7 +146,8 @@ impl Node {
|
|||||||
listen_pg_addr,
|
listen_pg_addr,
|
||||||
listen_pg_port,
|
listen_pg_port,
|
||||||
scheduling: NodeSchedulingPolicy::Filling,
|
scheduling: NodeSchedulingPolicy::Filling,
|
||||||
availability: NodeAvailability::Offline,
|
// TODO: we shouldn't really call this Active until we've heartbeated it.
|
||||||
|
availability: NodeAvailability::Active,
|
||||||
cancel: CancellationToken::new(),
|
cancel: CancellationToken::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -205,7 +194,7 @@ impl Node {
|
|||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Option<mgmt_api::Result<T>>
|
) -> Option<mgmt_api::Result<T>>
|
||||||
where
|
where
|
||||||
O: FnMut(PageserverClient) -> F,
|
O: FnMut(mgmt_api::Client) -> F,
|
||||||
F: std::future::Future<Output = mgmt_api::Result<T>>,
|
F: std::future::Future<Output = mgmt_api::Result<T>>,
|
||||||
{
|
{
|
||||||
fn is_fatal(e: &mgmt_api::Error) -> bool {
|
fn is_fatal(e: &mgmt_api::Error) -> bool {
|
||||||
@@ -227,12 +216,8 @@ impl Node {
|
|||||||
.build()
|
.build()
|
||||||
.expect("Failed to construct HTTP client");
|
.expect("Failed to construct HTTP client");
|
||||||
|
|
||||||
let client = PageserverClient::from_client(
|
let client =
|
||||||
self.get_id(),
|
mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
|
||||||
http_client,
|
|
||||||
self.base_url(),
|
|
||||||
jwt.as_deref(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let node_cancel_fut = self.cancel.cancelled();
|
let node_cancel_fut = self.cancel.cancelled();
|
||||||
|
|
||||||
@@ -257,19 +242,6 @@ impl Node {
|
|||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Generate the simplified API-friendly description of a node's state
|
|
||||||
pub(crate) fn describe(&self) -> NodeDescribeResponse {
|
|
||||||
NodeDescribeResponse {
|
|
||||||
id: self.id,
|
|
||||||
availability: self.availability.into(),
|
|
||||||
scheduling: self.scheduling,
|
|
||||||
listen_http_addr: self.listen_http_addr.clone(),
|
|
||||||
listen_http_port: self.listen_http_port,
|
|
||||||
listen_pg_addr: self.listen_pg_addr.clone(),
|
|
||||||
listen_pg_port: self.listen_pg_port,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for Node {
|
impl std::fmt::Display for Node {
|
||||||
@@ -9,20 +9,13 @@ use camino::Utf8PathBuf;
|
|||||||
use diesel::pg::PgConnection;
|
use diesel::pg::PgConnection;
|
||||||
use diesel::prelude::*;
|
use diesel::prelude::*;
|
||||||
use diesel::Connection;
|
use diesel::Connection;
|
||||||
use pageserver_api::controller_api::ShardSchedulingPolicy;
|
|
||||||
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
|
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
|
||||||
use pageserver_api::models::TenantConfig;
|
use pageserver_api::models::TenantConfig;
|
||||||
use pageserver_api::shard::ShardConfigError;
|
|
||||||
use pageserver_api::shard::ShardIdentity;
|
|
||||||
use pageserver_api::shard::ShardStripeSize;
|
|
||||||
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::generation::Generation;
|
use utils::generation::Generation;
|
||||||
use utils::id::{NodeId, TenantId};
|
use utils::id::{NodeId, TenantId};
|
||||||
|
|
||||||
use crate::metrics::{
|
|
||||||
DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
|
|
||||||
};
|
|
||||||
use crate::node::Node;
|
use crate::node::Node;
|
||||||
|
|
||||||
/// ## What do we store?
|
/// ## What do we store?
|
||||||
@@ -79,41 +72,8 @@ pub(crate) enum DatabaseError {
|
|||||||
Logical(String),
|
Logical(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
|
|
||||||
pub(crate) enum DatabaseOperation {
|
|
||||||
InsertNode,
|
|
||||||
UpdateNode,
|
|
||||||
DeleteNode,
|
|
||||||
ListNodes,
|
|
||||||
BeginShardSplit,
|
|
||||||
CompleteShardSplit,
|
|
||||||
AbortShardSplit,
|
|
||||||
Detach,
|
|
||||||
ReAttach,
|
|
||||||
IncrementGeneration,
|
|
||||||
ListTenantShards,
|
|
||||||
InsertTenantShards,
|
|
||||||
UpdateTenantShard,
|
|
||||||
DeleteTenant,
|
|
||||||
UpdateTenantConfig,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[must_use]
|
|
||||||
pub(crate) enum AbortShardSplitStatus {
|
|
||||||
/// We aborted the split in the database by reverting to the parent shards
|
|
||||||
Aborted,
|
|
||||||
/// The split had already been persisted.
|
|
||||||
Complete,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
|
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
|
||||||
|
|
||||||
/// Some methods can operate on either a whole tenant or a single shard
|
|
||||||
pub(crate) enum TenantFilter {
|
|
||||||
Tenant(TenantId),
|
|
||||||
Shard(TenantShardId),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Persistence {
|
impl Persistence {
|
||||||
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
|
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
|
||||||
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
|
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
|
||||||
@@ -144,36 +104,10 @@ impl Persistence {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Wraps `with_conn` in order to collect latency and error metrics
|
|
||||||
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
|
|
||||||
where
|
|
||||||
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
|
||||||
R: Send + 'static,
|
|
||||||
{
|
|
||||||
let latency = &METRICS_REGISTRY
|
|
||||||
.metrics_group
|
|
||||||
.storage_controller_database_query_latency;
|
|
||||||
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op });
|
|
||||||
|
|
||||||
let res = self.with_conn(func).await;
|
|
||||||
|
|
||||||
if let Err(err) = &res {
|
|
||||||
let error_counter = &METRICS_REGISTRY
|
|
||||||
.metrics_group
|
|
||||||
.storage_controller_database_query_error;
|
|
||||||
error_counter.inc(DatabaseQueryErrorLabelGroup {
|
|
||||||
error_type: err.error_label(),
|
|
||||||
operation: op,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
res
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
||||||
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
||||||
where
|
where
|
||||||
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||||
R: Send + 'static,
|
R: Send + 'static,
|
||||||
{
|
{
|
||||||
let mut conn = self.connection_pool.get()?;
|
let mut conn = self.connection_pool.get()?;
|
||||||
@@ -185,27 +119,21 @@ impl Persistence {
|
|||||||
/// When a node is first registered, persist it before using it for anything
|
/// When a node is first registered, persist it before using it for anything
|
||||||
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
|
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
|
||||||
let np = node.to_persistent();
|
let np = node.to_persistent();
|
||||||
self.with_measured_conn(
|
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||||
DatabaseOperation::InsertNode,
|
diesel::insert_into(crate::schema::nodes::table)
|
||||||
move |conn| -> DatabaseResult<()> {
|
.values(&np)
|
||||||
diesel::insert_into(crate::schema::nodes::table)
|
.execute(conn)?;
|
||||||
.values(&np)
|
Ok(())
|
||||||
.execute(conn)?;
|
})
|
||||||
Ok(())
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// At startup, populate the list of nodes which our shards may be placed on
|
/// At startup, populate the list of nodes which our shards may be placed on
|
||||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
|
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
|
||||||
let nodes: Vec<NodePersistence> = self
|
let nodes: Vec<NodePersistence> = self
|
||||||
.with_measured_conn(
|
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||||
DatabaseOperation::ListNodes,
|
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
||||||
move |conn| -> DatabaseResult<_> {
|
})
|
||||||
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
|
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
|
||||||
@@ -220,7 +148,7 @@ impl Persistence {
|
|||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::nodes::dsl::*;
|
use crate::schema::nodes::dsl::*;
|
||||||
let updated = self
|
let updated = self
|
||||||
.with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
|
.with_conn(move |conn| {
|
||||||
let updated = diesel::update(nodes)
|
let updated = diesel::update(nodes)
|
||||||
.filter(node_id.eq(input_node_id.0 as i64))
|
.filter(node_id.eq(input_node_id.0 as i64))
|
||||||
.set((scheduling_policy.eq(String::from(input_scheduling)),))
|
.set((scheduling_policy.eq(String::from(input_scheduling)),))
|
||||||
@@ -242,12 +170,9 @@ impl Persistence {
|
|||||||
/// be enriched at runtime with state discovered on pageservers.
|
/// be enriched at runtime with state discovered on pageservers.
|
||||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||||
let loaded = self
|
let loaded = self
|
||||||
.with_measured_conn(
|
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||||
DatabaseOperation::ListTenantShards,
|
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
||||||
move |conn| -> DatabaseResult<_> {
|
})
|
||||||
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
if loaded.is_empty() {
|
if loaded.is_empty() {
|
||||||
@@ -275,15 +200,15 @@ impl Persistence {
|
|||||||
|
|
||||||
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
|
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
|
||||||
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
|
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
|
||||||
for shard in decoded.tenants.values_mut() {
|
for (tenant_id, tenant) in &mut decoded.tenants {
|
||||||
if shard.placement_policy == "\"Single\"" {
|
// Backward compat: an old attachments.json from before PR #6251, replace
|
||||||
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
|
// empty strings with proper defaults.
|
||||||
shard.placement_policy = "{\"Attached\":0}".to_string();
|
if tenant.tenant_id.is_empty() {
|
||||||
}
|
tenant.tenant_id = tenant_id.to_string();
|
||||||
|
tenant.config = serde_json::to_string(&TenantConfig::default())
|
||||||
if shard.scheduling_policy.is_empty() {
|
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||||
shard.scheduling_policy =
|
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
|
||||||
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
|
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -329,20 +254,17 @@ impl Persistence {
|
|||||||
shards: Vec<TenantShardPersistence>,
|
shards: Vec<TenantShardPersistence>,
|
||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_measured_conn(
|
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||||
DatabaseOperation::InsertTenantShards,
|
conn.transaction(|conn| -> QueryResult<()> {
|
||||||
move |conn| -> DatabaseResult<()> {
|
for tenant in &shards {
|
||||||
conn.transaction(|conn| -> QueryResult<()> {
|
diesel::insert_into(tenant_shards)
|
||||||
for tenant in &shards {
|
.values(tenant)
|
||||||
diesel::insert_into(tenant_shards)
|
.execute(conn)?;
|
||||||
.values(tenant)
|
}
|
||||||
.execute(conn)?;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
})?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
},
|
})?;
|
||||||
)
|
Ok(())
|
||||||
|
})
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -350,31 +272,25 @@ impl Persistence {
|
|||||||
/// the tenant from memory on this server.
|
/// the tenant from memory on this server.
|
||||||
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_measured_conn(
|
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||||
DatabaseOperation::DeleteTenant,
|
diesel::delete(tenant_shards)
|
||||||
move |conn| -> DatabaseResult<()> {
|
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
||||||
diesel::delete(tenant_shards)
|
.execute(conn)?;
|
||||||
.filter(tenant_id.eq(del_tenant_id.to_string()))
|
|
||||||
.execute(conn)?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
},
|
})
|
||||||
)
|
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
|
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
|
||||||
use crate::schema::nodes::dsl::*;
|
use crate::schema::nodes::dsl::*;
|
||||||
self.with_measured_conn(
|
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||||
DatabaseOperation::DeleteNode,
|
diesel::delete(nodes)
|
||||||
move |conn| -> DatabaseResult<()> {
|
.filter(node_id.eq(del_node_id.0 as i64))
|
||||||
diesel::delete(nodes)
|
.execute(conn)?;
|
||||||
.filter(node_id.eq(del_node_id.0 as i64))
|
|
||||||
.execute(conn)?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
},
|
})
|
||||||
)
|
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -388,7 +304,7 @@ impl Persistence {
|
|||||||
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
|
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
let updated = self
|
let updated = self
|
||||||
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
|
.with_conn(move |conn| {
|
||||||
let rows_updated = diesel::update(tenant_shards)
|
let rows_updated = diesel::update(tenant_shards)
|
||||||
.filter(generation_pageserver.eq(node_id.0 as i64))
|
.filter(generation_pageserver.eq(node_id.0 as i64))
|
||||||
.set(generation.eq(generation + 1))
|
.set(generation.eq(generation + 1))
|
||||||
@@ -438,7 +354,7 @@ impl Persistence {
|
|||||||
) -> anyhow::Result<Generation> {
|
) -> anyhow::Result<Generation> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
let updated = self
|
let updated = self
|
||||||
.with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
|
.with_conn(move |conn| {
|
||||||
let updated = diesel::update(tenant_shards)
|
let updated = diesel::update(tenant_shards)
|
||||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||||
@@ -475,45 +391,59 @@ impl Persistence {
|
|||||||
/// that we only do the first time a tenant is set to an attached policy via /location_config.
|
/// that we only do the first time a tenant is set to an attached policy via /location_config.
|
||||||
pub(crate) async fn update_tenant_shard(
|
pub(crate) async fn update_tenant_shard(
|
||||||
&self,
|
&self,
|
||||||
tenant: TenantFilter,
|
tenant_shard_id: TenantShardId,
|
||||||
input_placement_policy: Option<PlacementPolicy>,
|
input_placement_policy: PlacementPolicy,
|
||||||
input_config: Option<TenantConfig>,
|
input_config: TenantConfig,
|
||||||
input_generation: Option<Generation>,
|
input_generation: Option<Generation>,
|
||||||
input_scheduling_policy: Option<ShardSchedulingPolicy>,
|
|
||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
|
|
||||||
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
|
self.with_conn(move |conn| {
|
||||||
let query = match tenant {
|
let query = diesel::update(tenant_shards)
|
||||||
TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
|
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
|
||||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
|
||||||
.into_boxed(),
|
|
||||||
TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
|
|
||||||
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
|
||||||
.into_boxed(),
|
|
||||||
};
|
|
||||||
|
|
||||||
#[derive(AsChangeset)]
|
if let Some(input_generation) = input_generation {
|
||||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
// Update includes generation column
|
||||||
struct ShardUpdate {
|
query
|
||||||
generation: Option<i32>,
|
.set((
|
||||||
placement_policy: Option<String>,
|
generation.eq(Some(input_generation.into().unwrap() as i32)),
|
||||||
config: Option<String>,
|
placement_policy
|
||||||
scheduling_policy: Option<String>,
|
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
|
||||||
|
config.eq(serde_json::to_string(&input_config).unwrap()),
|
||||||
|
))
|
||||||
|
.execute(conn)?;
|
||||||
|
} else {
|
||||||
|
// Update does not include generation column
|
||||||
|
query
|
||||||
|
.set((
|
||||||
|
placement_policy
|
||||||
|
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
|
||||||
|
config.eq(serde_json::to_string(&input_config).unwrap()),
|
||||||
|
))
|
||||||
|
.execute(conn)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let update = ShardUpdate {
|
Ok(())
|
||||||
generation: input_generation.map(|g| g.into().unwrap() as i32),
|
})
|
||||||
placement_policy: input_placement_policy
|
.await?;
|
||||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
|
||||||
config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
|
|
||||||
scheduling_policy: input_scheduling_policy
|
|
||||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
|
||||||
};
|
|
||||||
|
|
||||||
query.set(update).execute(conn)?;
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn update_tenant_config(
|
||||||
|
&self,
|
||||||
|
input_tenant_id: TenantId,
|
||||||
|
input_config: TenantConfig,
|
||||||
|
) -> DatabaseResult<()> {
|
||||||
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
|
|
||||||
|
self.with_conn(move |conn| {
|
||||||
|
diesel::update(tenant_shards)
|
||||||
|
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
||||||
|
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
|
||||||
|
.execute(conn)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
})
|
||||||
@@ -524,7 +454,7 @@ impl Persistence {
|
|||||||
|
|
||||||
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
|
self.with_conn(move |conn| {
|
||||||
let updated = diesel::update(tenant_shards)
|
let updated = diesel::update(tenant_shards)
|
||||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||||
@@ -554,7 +484,7 @@ impl Persistence {
|
|||||||
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
|
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
|
||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
|
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||||
conn.transaction(|conn| -> DatabaseResult<()> {
|
conn.transaction(|conn| -> DatabaseResult<()> {
|
||||||
// Mark parent shards as splitting
|
// Mark parent shards as splitting
|
||||||
|
|
||||||
@@ -618,83 +548,31 @@ impl Persistence {
|
|||||||
old_shard_count: ShardCount,
|
old_shard_count: ShardCount,
|
||||||
) -> DatabaseResult<()> {
|
) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_measured_conn(
|
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||||
DatabaseOperation::CompleteShardSplit,
|
conn.transaction(|conn| -> QueryResult<()> {
|
||||||
move |conn| -> DatabaseResult<()> {
|
// Drop parent shards
|
||||||
conn.transaction(|conn| -> QueryResult<()> {
|
diesel::delete(tenant_shards)
|
||||||
// Drop parent shards
|
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||||
diesel::delete(tenant_shards)
|
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
.execute(conn)?;
|
||||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
|
||||||
.execute(conn)?;
|
|
||||||
|
|
||||||
// Clear sharding flag
|
// Clear sharding flag
|
||||||
let updated = diesel::update(tenant_shards)
|
let updated = diesel::update(tenant_shards)
|
||||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||||
.set((splitting.eq(0),))
|
.set((splitting.eq(0),))
|
||||||
.execute(conn)?;
|
.execute(conn)?;
|
||||||
debug_assert!(updated > 0);
|
debug_assert!(updated > 0);
|
||||||
|
|
||||||
Ok(())
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
},
|
})?;
|
||||||
)
|
|
||||||
.await
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Used when the remote part of a shard split failed: we will revert the database state to have only
|
Ok(())
|
||||||
/// the parent shards, with SplitState::Idle.
|
})
|
||||||
pub(crate) async fn abort_shard_split(
|
|
||||||
&self,
|
|
||||||
split_tenant_id: TenantId,
|
|
||||||
new_shard_count: ShardCount,
|
|
||||||
) -> DatabaseResult<AbortShardSplitStatus> {
|
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
|
||||||
self.with_measured_conn(
|
|
||||||
DatabaseOperation::AbortShardSplit,
|
|
||||||
move |conn| -> DatabaseResult<AbortShardSplitStatus> {
|
|
||||||
let aborted =
|
|
||||||
conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
|
|
||||||
// Clear the splitting state on parent shards
|
|
||||||
let updated = diesel::update(tenant_shards)
|
|
||||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
|
||||||
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
|
||||||
.set((splitting.eq(0),))
|
|
||||||
.execute(conn)?;
|
|
||||||
|
|
||||||
// Parent shards are already gone: we cannot abort.
|
|
||||||
if updated == 0 {
|
|
||||||
return Ok(AbortShardSplitStatus::Complete);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sanity check: if parent shards were present, their cardinality should
|
|
||||||
// be less than the number of child shards.
|
|
||||||
if updated >= new_shard_count.count() as usize {
|
|
||||||
return Err(DatabaseError::Logical(format!(
|
|
||||||
"Unexpected parent shard count {updated} while aborting split to \
|
|
||||||
count {new_shard_count:?} on tenant {split_tenant_id}"
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Erase child shards
|
|
||||||
diesel::delete(tenant_shards)
|
|
||||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
|
||||||
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
|
||||||
.execute(conn)?;
|
|
||||||
|
|
||||||
Ok(AbortShardSplitStatus::Aborted)
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok(aborted)
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
|
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
|
||||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
|
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
|
||||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||||
pub(crate) struct TenantShardPersistence {
|
pub(crate) struct TenantShardPersistence {
|
||||||
@@ -724,30 +602,6 @@ pub(crate) struct TenantShardPersistence {
|
|||||||
pub(crate) splitting: SplitState,
|
pub(crate) splitting: SplitState,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub(crate) config: String,
|
pub(crate) config: String,
|
||||||
#[serde(default)]
|
|
||||||
pub(crate) scheduling_policy: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TenantShardPersistence {
|
|
||||||
pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
|
|
||||||
if self.shard_count == 0 {
|
|
||||||
Ok(ShardIdentity::unsharded())
|
|
||||||
} else {
|
|
||||||
Ok(ShardIdentity::new(
|
|
||||||
ShardNumber(self.shard_number as u8),
|
|
||||||
ShardCount::new(self.shard_count as u8),
|
|
||||||
ShardStripeSize(self.shard_stripe_size as u32),
|
|
||||||
)?)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
|
|
||||||
Ok(TenantShardId {
|
|
||||||
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
|
|
||||||
shard_number: ShardNumber(self.shard_number as u8),
|
|
||||||
shard_count: ShardCount::new(self.shard_count as u8),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parts of [`crate::node::Node`] that are stored durably
|
/// Parts of [`crate::node::Node`] that are stored durably
|
||||||
@@ -1,7 +1,5 @@
|
|||||||
use crate::pageserver_client::PageserverClient;
|
|
||||||
use crate::persistence::Persistence;
|
use crate::persistence::Persistence;
|
||||||
use crate::service;
|
use crate::service;
|
||||||
use hyper::StatusCode;
|
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||||
};
|
};
|
||||||
@@ -9,7 +7,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
|||||||
use pageserver_client::mgmt_api;
|
use pageserver_client::mgmt_api;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::Duration;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use utils::generation::Generation;
|
use utils::generation::Generation;
|
||||||
use utils::id::{NodeId, TimelineId};
|
use utils::id::{NodeId, TimelineId};
|
||||||
@@ -18,14 +16,12 @@ use utils::sync::gate::GateGuard;
|
|||||||
|
|
||||||
use crate::compute_hook::{ComputeHook, NotifyError};
|
use crate::compute_hook::{ComputeHook, NotifyError};
|
||||||
use crate::node::Node;
|
use crate::node::Node;
|
||||||
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
|
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
|
||||||
|
|
||||||
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
|
|
||||||
|
|
||||||
/// Object with the lifetime of the background reconcile task that is created
|
/// Object with the lifetime of the background reconcile task that is created
|
||||||
/// for tenants which have a difference between their intent and observed states.
|
/// for tenants which have a difference between their intent and observed states.
|
||||||
pub(super) struct Reconciler {
|
pub(super) struct Reconciler {
|
||||||
/// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
|
/// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
|
||||||
/// of a tenant's state from when we spawned a reconcile task.
|
/// of a tenant's state from when we spawned a reconcile task.
|
||||||
pub(super) tenant_shard_id: TenantShardId,
|
pub(super) tenant_shard_id: TenantShardId,
|
||||||
pub(crate) shard: ShardIdentity,
|
pub(crate) shard: ShardIdentity,
|
||||||
@@ -48,11 +44,11 @@ pub(super) struct Reconciler {
|
|||||||
|
|
||||||
/// To avoid stalling if the cloud control plane is unavailable, we may proceed
|
/// To avoid stalling if the cloud control plane is unavailable, we may proceed
|
||||||
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
|
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
|
||||||
/// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
|
/// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
|
||||||
pub(crate) compute_notify_failure: bool,
|
pub(crate) compute_notify_failure: bool,
|
||||||
|
|
||||||
/// A means to abort background reconciliation: it is essential to
|
/// A means to abort background reconciliation: it is essential to
|
||||||
/// call this when something changes in the original TenantShard that
|
/// call this when something changes in the original TenantState that
|
||||||
/// will make this reconciliation impossible or unnecessary, for
|
/// will make this reconciliation impossible or unnecessary, for
|
||||||
/// example when a pageserver node goes offline, or the PlacementPolicy for
|
/// example when a pageserver node goes offline, or the PlacementPolicy for
|
||||||
/// the tenant is changed.
|
/// the tenant is changed.
|
||||||
@@ -66,7 +62,7 @@ pub(super) struct Reconciler {
|
|||||||
pub(crate) persistence: Arc<Persistence>,
|
pub(crate) persistence: Arc<Persistence>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
|
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
|
||||||
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
|
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
|
||||||
/// and the TargetState is just the instruction for a particular Reconciler run.
|
/// and the TargetState is just the instruction for a particular Reconciler run.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@@ -118,15 +114,6 @@ impl Reconciler {
|
|||||||
flush_ms: Option<Duration>,
|
flush_ms: Option<Duration>,
|
||||||
lazy: bool,
|
lazy: bool,
|
||||||
) -> Result<(), ReconcileError> {
|
) -> Result<(), ReconcileError> {
|
||||||
if !node.is_available() && config.mode == LocationConfigMode::Detached {
|
|
||||||
// Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
|
|
||||||
// will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
|
|
||||||
// what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
|
|
||||||
tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
|
|
||||||
self.observed.locations.remove(&node.get_id());
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
self.observed
|
self.observed
|
||||||
.locations
|
.locations
|
||||||
.insert(node.get_id(), ObservedStateLocation { conf: None });
|
.insert(node.get_id(), ObservedStateLocation { conf: None });
|
||||||
@@ -159,16 +146,9 @@ impl Reconciler {
|
|||||||
};
|
};
|
||||||
tracing::info!("location_config({node}) complete: {:?}", config);
|
tracing::info!("location_config({node}) complete: {:?}", config);
|
||||||
|
|
||||||
match config.mode {
|
self.observed
|
||||||
LocationConfigMode::Detached => {
|
.locations
|
||||||
self.observed.locations.remove(&node.get_id());
|
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
self.observed
|
|
||||||
.locations
|
|
||||||
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -260,11 +240,8 @@ impl Reconciler {
|
|||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
node: &Node,
|
node: &Node,
|
||||||
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
||||||
let client = PageserverClient::new(
|
let client =
|
||||||
node.get_id(),
|
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
||||||
node.base_url(),
|
|
||||||
self.service_config.jwt_token.as_deref(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let timelines = client.timeline_list(&tenant_shard_id).await?;
|
let timelines = client.timeline_list(&tenant_shard_id).await?;
|
||||||
Ok(timelines
|
Ok(timelines
|
||||||
@@ -278,81 +255,22 @@ impl Reconciler {
|
|||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
node: &Node,
|
node: &Node,
|
||||||
) -> Result<(), ReconcileError> {
|
) -> Result<(), ReconcileError> {
|
||||||
// This is not the timeout for a request, but the total amount of time we're willing to wait
|
match node
|
||||||
// for a secondary location to get up to date before
|
.with_client_retries(
|
||||||
const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
|
|client| async move { client.tenant_secondary_download(tenant_shard_id).await },
|
||||||
|
&self.service_config.jwt_token,
|
||||||
// This the long-polling interval for the secondary download requests we send to destination pageserver
|
1,
|
||||||
// during a migration.
|
1,
|
||||||
const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
|
Duration::from_secs(60),
|
||||||
|
&self.cancel,
|
||||||
let started_at = Instant::now();
|
)
|
||||||
|
.await
|
||||||
loop {
|
{
|
||||||
let (status, progress) = match node
|
None => Err(ReconcileError::Cancel),
|
||||||
.with_client_retries(
|
Some(Ok(_)) => Ok(()),
|
||||||
|client| async move {
|
Some(Err(e)) => {
|
||||||
client
|
tracing::info!(" (skipping destination download: {})", e);
|
||||||
.tenant_secondary_download(
|
Ok(())
|
||||||
tenant_shard_id,
|
|
||||||
Some(REQUEST_DOWNLOAD_TIMEOUT),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
},
|
|
||||||
&self.service_config.jwt_token,
|
|
||||||
1,
|
|
||||||
3,
|
|
||||||
REQUEST_DOWNLOAD_TIMEOUT * 2,
|
|
||||||
&self.cancel,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
None => Err(ReconcileError::Cancel),
|
|
||||||
Some(Ok(v)) => Ok(v),
|
|
||||||
Some(Err(e)) => {
|
|
||||||
// Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
|
|
||||||
// attaching, but we should not let an issue with a secondary location stop us proceeding
|
|
||||||
// with a live migration.
|
|
||||||
tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}?;
|
|
||||||
|
|
||||||
if status == StatusCode::OK {
|
|
||||||
tracing::info!(
|
|
||||||
"Downloads to {} complete: {}/{} layers, {}/{} bytes",
|
|
||||||
node,
|
|
||||||
progress.layers_downloaded,
|
|
||||||
progress.layers_total,
|
|
||||||
progress.bytes_downloaded,
|
|
||||||
progress.bytes_total
|
|
||||||
);
|
|
||||||
return Ok(());
|
|
||||||
} else if status == StatusCode::ACCEPTED {
|
|
||||||
let total_runtime = started_at.elapsed();
|
|
||||||
if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
|
|
||||||
tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes",
|
|
||||||
total_runtime.as_millis(),
|
|
||||||
progress.layers_downloaded,
|
|
||||||
progress.layers_total,
|
|
||||||
progress.bytes_downloaded,
|
|
||||||
progress.bytes_total
|
|
||||||
);
|
|
||||||
// Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
|
|
||||||
// it just makes the I/O performance for users less good.
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call
|
|
||||||
// to the pageserver is a long-poll.
|
|
||||||
tracing::info!(
|
|
||||||
"Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
|
|
||||||
node,
|
|
||||||
progress.layers_downloaded,
|
|
||||||
progress.layers_total,
|
|
||||||
progress.bytes_downloaded,
|
|
||||||
progress.bytes_total
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -487,7 +405,6 @@ impl Reconciler {
|
|||||||
while let Err(e) = self.compute_notify().await {
|
while let Err(e) = self.compute_notify().await {
|
||||||
match e {
|
match e {
|
||||||
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
|
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
|
||||||
NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
|
|
||||||
_ => {
|
_ => {
|
||||||
tracing::warn!(
|
tracing::warn!(
|
||||||
"Live migration blocked by compute notification error, retrying: {e}"
|
"Live migration blocked by compute notification error, retrying: {e}"
|
||||||
@@ -496,7 +413,7 @@ impl Reconciler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then
|
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
|
||||||
// this location will be deleted in the general case reconciliation that runs after this.
|
// this location will be deleted in the general case reconciliation that runs after this.
|
||||||
let origin_secondary_conf = build_location_config(
|
let origin_secondary_conf = build_location_config(
|
||||||
&self.shard,
|
&self.shard,
|
||||||
@@ -568,29 +485,17 @@ impl Reconciler {
|
|||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Some(Ok(observed)) => Some(observed),
|
Some(Ok(observed)) => observed,
|
||||||
Some(Err(mgmt_api::Error::ApiError(status, _msg)))
|
|
||||||
if status == StatusCode::NOT_FOUND =>
|
|
||||||
{
|
|
||||||
None
|
|
||||||
}
|
|
||||||
Some(Err(e)) => return Err(e.into()),
|
Some(Err(e)) => return Err(e.into()),
|
||||||
None => return Err(ReconcileError::Cancel),
|
None => return Err(ReconcileError::Cancel),
|
||||||
};
|
};
|
||||||
tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
|
tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
|
||||||
match observed_conf {
|
self.observed.locations.insert(
|
||||||
Some(conf) => {
|
attached_node.get_id(),
|
||||||
// Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state,
|
ObservedStateLocation {
|
||||||
// if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
|
conf: observed_conf,
|
||||||
self.observed
|
},
|
||||||
.locations
|
);
|
||||||
.insert(attached_node.get_id(), ObservedStateLocation { conf });
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
// Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
|
|
||||||
self.observed.locations.remove(&attached_node.get_id());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -620,12 +525,7 @@ impl Reconciler {
|
|||||||
)));
|
)));
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut wanted_conf = attached_location_conf(
|
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
|
||||||
generation,
|
|
||||||
&self.shard,
|
|
||||||
&self.config,
|
|
||||||
!self.intent.secondary.is_empty(),
|
|
||||||
);
|
|
||||||
match self.observed.locations.get(&node.get_id()) {
|
match self.observed.locations.get(&node.get_id()) {
|
||||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||||
// Nothing to do
|
// Nothing to do
|
||||||
@@ -762,26 +662,10 @@ impl Reconciler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// We tweak the externally-set TenantConfig while configuring
|
|
||||||
/// locations, using our awareness of whether secondary locations
|
|
||||||
/// are in use to automatically enable/disable heatmap uploads.
|
|
||||||
fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
|
|
||||||
let mut config = config.clone();
|
|
||||||
if has_secondaries {
|
|
||||||
if config.heatmap_period.is_none() {
|
|
||||||
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
config.heatmap_period = None;
|
|
||||||
}
|
|
||||||
config
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn attached_location_conf(
|
pub(crate) fn attached_location_conf(
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
shard: &ShardIdentity,
|
shard: &ShardIdentity,
|
||||||
config: &TenantConfig,
|
config: &TenantConfig,
|
||||||
has_secondaries: bool,
|
|
||||||
) -> LocationConfig {
|
) -> LocationConfig {
|
||||||
LocationConfig {
|
LocationConfig {
|
||||||
mode: LocationConfigMode::AttachedSingle,
|
mode: LocationConfigMode::AttachedSingle,
|
||||||
@@ -790,7 +674,7 @@ pub(crate) fn attached_location_conf(
|
|||||||
shard_number: shard.number.0,
|
shard_number: shard.number.0,
|
||||||
shard_count: shard.count.literal(),
|
shard_count: shard.count.literal(),
|
||||||
shard_stripe_size: shard.stripe_size.0,
|
shard_stripe_size: shard.stripe_size.0,
|
||||||
tenant_conf: ha_aware_config(config, has_secondaries),
|
tenant_conf: config.clone(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -805,6 +689,6 @@ pub(crate) fn secondary_location_conf(
|
|||||||
shard_number: shard.number.0,
|
shard_number: shard.number.0,
|
||||||
shard_count: shard.count.literal(),
|
shard_count: shard.count.literal(),
|
||||||
shard_stripe_size: shard.stripe_size.0,
|
shard_stripe_size: shard.stripe_size.0,
|
||||||
tenant_conf: ha_aware_config(config, true),
|
tenant_conf: config.clone(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,5 +1,4 @@
|
|||||||
use crate::{node::Node, tenant_shard::TenantShard};
|
use crate::{node::Node, tenant_state::TenantState};
|
||||||
use pageserver_api::controller_api::UtilizationScore;
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use utils::{http::error::ApiError, id::NodeId};
|
use utils::{http::error::ApiError, id::NodeId};
|
||||||
@@ -20,34 +19,15 @@ impl From<ScheduleError> for ApiError {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Eq, PartialEq)]
|
#[derive(Serialize, Eq, PartialEq)]
|
||||||
pub enum MaySchedule {
|
|
||||||
Yes(UtilizationScore),
|
|
||||||
No,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize)]
|
|
||||||
struct SchedulerNode {
|
struct SchedulerNode {
|
||||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
|
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
|
||||||
shard_count: usize,
|
shard_count: usize,
|
||||||
|
|
||||||
/// Whether this node is currently elegible to have new shards scheduled (this is derived
|
/// Whether this node is currently elegible to have new shards scheduled (this is derived
|
||||||
/// from a node's availability state and scheduling policy).
|
/// from a node's availability state and scheduling policy).
|
||||||
may_schedule: MaySchedule,
|
may_schedule: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PartialEq for SchedulerNode {
|
|
||||||
fn eq(&self, other: &Self) -> bool {
|
|
||||||
let may_schedule_matches = matches!(
|
|
||||||
(&self.may_schedule, &other.may_schedule),
|
|
||||||
(MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
|
|
||||||
);
|
|
||||||
|
|
||||||
may_schedule_matches && self.shard_count == other.shard_count
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Eq for SchedulerNode {}
|
|
||||||
|
|
||||||
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
|
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
|
||||||
/// on which to run.
|
/// on which to run.
|
||||||
///
|
///
|
||||||
@@ -58,86 +38,6 @@ pub(crate) struct Scheduler {
|
|||||||
nodes: HashMap<NodeId, SchedulerNode>,
|
nodes: HashMap<NodeId, SchedulerNode>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
|
|
||||||
///
|
|
||||||
/// For example, we may set an affinity score based on the number of shards from the same
|
|
||||||
/// tenant already on a node, to implicitly prefer to balance out shards.
|
|
||||||
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
|
|
||||||
pub(crate) struct AffinityScore(pub(crate) usize);
|
|
||||||
|
|
||||||
impl AffinityScore {
|
|
||||||
/// If we have no anti-affinity at all toward a node, this is its score. It means
|
|
||||||
/// the scheduler has a free choice amongst nodes with this score, and may pick a node
|
|
||||||
/// based on other information such as total utilization.
|
|
||||||
pub(crate) const FREE: Self = Self(0);
|
|
||||||
|
|
||||||
pub(crate) fn inc(&mut self) {
|
|
||||||
self.0 += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl std::ops::Add for AffinityScore {
|
|
||||||
type Output = Self;
|
|
||||||
|
|
||||||
fn add(self, rhs: Self) -> Self::Output {
|
|
||||||
Self(self.0 + rhs.0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Hint for whether this is a sincere attempt to schedule, or a speculative
|
|
||||||
/// check for where we _would_ schedule (done during optimization)
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub(crate) enum ScheduleMode {
|
|
||||||
Normal,
|
|
||||||
Speculative,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for ScheduleMode {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self::Normal
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
|
|
||||||
// it for many shards in the same tenant.
|
|
||||||
#[derive(Debug, Default)]
|
|
||||||
pub(crate) struct ScheduleContext {
|
|
||||||
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
|
|
||||||
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
|
|
||||||
|
|
||||||
/// Specifically how many _attached_ locations are on each node
|
|
||||||
pub(crate) attached_nodes: HashMap<NodeId, usize>,
|
|
||||||
|
|
||||||
pub(crate) mode: ScheduleMode,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ScheduleContext {
|
|
||||||
/// Input is a list of nodes we would like to avoid using again within this context. The more
|
|
||||||
/// times a node is passed into this call, the less inclined we are to use it.
|
|
||||||
pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
|
|
||||||
for node_id in nodes {
|
|
||||||
let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
|
|
||||||
entry.inc()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
|
|
||||||
let entry = self.attached_nodes.entry(node_id).or_default();
|
|
||||||
*entry += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
|
|
||||||
self.nodes
|
|
||||||
.get(&node_id)
|
|
||||||
.copied()
|
|
||||||
.unwrap_or(AffinityScore::FREE)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
|
|
||||||
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Scheduler {
|
impl Scheduler {
|
||||||
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
|
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
|
||||||
let mut scheduler_nodes = HashMap::new();
|
let mut scheduler_nodes = HashMap::new();
|
||||||
@@ -163,7 +63,7 @@ impl Scheduler {
|
|||||||
pub(crate) fn consistency_check<'a>(
|
pub(crate) fn consistency_check<'a>(
|
||||||
&self,
|
&self,
|
||||||
nodes: impl Iterator<Item = &'a Node>,
|
nodes: impl Iterator<Item = &'a Node>,
|
||||||
shards: impl Iterator<Item = &'a TenantShard>,
|
shards: impl Iterator<Item = &'a TenantState>,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
|
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
|
||||||
for node in nodes {
|
for node in nodes {
|
||||||
@@ -286,15 +186,13 @@ impl Scheduler {
|
|||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: When the utilization score returned by the pageserver becomes meaningful,
|
|
||||||
// schedule based on that instead of the shard count.
|
|
||||||
let node = nodes
|
let node = nodes
|
||||||
.iter()
|
.iter()
|
||||||
.map(|node_id| {
|
.map(|node_id| {
|
||||||
let may_schedule = self
|
let may_schedule = self
|
||||||
.nodes
|
.nodes
|
||||||
.get(node_id)
|
.get(node_id)
|
||||||
.map(|n| n.may_schedule != MaySchedule::No)
|
.map(|n| n.may_schedule)
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
(*node_id, may_schedule)
|
(*node_id, may_schedule)
|
||||||
})
|
})
|
||||||
@@ -304,94 +202,59 @@ impl Scheduler {
|
|||||||
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they
|
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
|
||||||
/// are already in use by this shard -- we use this to avoid picking the same node
|
|
||||||
/// as both attached and secondary location. This is a hard constraint: if we cannot
|
|
||||||
/// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
|
|
||||||
///
|
|
||||||
/// context: we prefer to avoid using nodes identified in the context, according
|
|
||||||
/// to their anti-affinity score. We use this to prefeer to avoid placing shards in
|
|
||||||
/// the same tenant on the same node. This is a soft constraint: the context will never
|
|
||||||
/// cause us to fail to schedule a shard.
|
|
||||||
pub(crate) fn schedule_shard(
|
|
||||||
&self,
|
|
||||||
hard_exclude: &[NodeId],
|
|
||||||
context: &ScheduleContext,
|
|
||||||
) -> Result<NodeId, ScheduleError> {
|
|
||||||
if self.nodes.is_empty() {
|
if self.nodes.is_empty() {
|
||||||
return Err(ScheduleError::NoPageservers);
|
return Err(ScheduleError::NoPageservers);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
|
let mut tenant_counts: Vec<(NodeId, usize)> = self
|
||||||
.nodes
|
.nodes
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|(k, v)| {
|
.filter_map(|(k, v)| {
|
||||||
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
|
if hard_exclude.contains(k) || !v.may_schedule {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
Some((
|
Some((*k, v.shard_count))
|
||||||
*k,
|
|
||||||
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
|
|
||||||
v.shard_count,
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
// Sort by, in order of precedence:
|
// Sort by tenant count. Nodes with the same tenant count are sorted by ID.
|
||||||
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
|
tenant_counts.sort_by_key(|i| (i.1, i.0));
|
||||||
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
|
|
||||||
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
|
|
||||||
scores.sort_by_key(|i| (i.1, i.2, i.0));
|
|
||||||
|
|
||||||
if scores.is_empty() {
|
if tenant_counts.is_empty() {
|
||||||
// After applying constraints, no pageservers were left.
|
// After applying constraints, no pageservers were left. We log some detail about
|
||||||
if !matches!(context.mode, ScheduleMode::Speculative) {
|
// the state of nodes to help understand why this happened. This is not logged as an error because
|
||||||
// If this was not a speculative attempt, log details to understand why we couldn't
|
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
|
||||||
// schedule: this may help an engineer understand if some nodes are marked offline
|
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
|
||||||
// in a way that's preventing progress.
|
for (node_id, node) in &self.nodes {
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
"Scheduling failure, while excluding {hard_exclude:?}, node states:"
|
"Node {node_id}: may_schedule={} shards={}",
|
||||||
|
node.may_schedule,
|
||||||
|
node.shard_count
|
||||||
);
|
);
|
||||||
for (node_id, node) in &self.nodes {
|
|
||||||
tracing::info!(
|
|
||||||
"Node {node_id}: may_schedule={} shards={}",
|
|
||||||
node.may_schedule != MaySchedule::No,
|
|
||||||
node.shard_count
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return Err(ScheduleError::ImpossibleConstraint);
|
return Err(ScheduleError::ImpossibleConstraint);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Lowest score wins
|
let node_id = tenant_counts.first().unwrap().0;
|
||||||
let node_id = scores.first().unwrap().0;
|
tracing::info!(
|
||||||
|
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
|
||||||
if !matches!(context.mode, ScheduleMode::Speculative) {
|
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||||
tracing::info!(
|
|
||||||
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
|
|
||||||
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
|
||||||
);
|
);
|
||||||
}
|
|
||||||
|
|
||||||
// Note that we do not update shard count here to reflect the scheduling: that
|
// Note that we do not update shard count here to reflect the scheduling: that
|
||||||
// is IntentState's job when the scheduled location is used.
|
// is IntentState's job when the scheduled location is used.
|
||||||
|
|
||||||
Ok(node_id)
|
Ok(node_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Unit test access to internal state
|
|
||||||
#[cfg(test)]
|
|
||||||
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
|
|
||||||
self.nodes.get(&node_id).unwrap().shard_count
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub(crate) mod test_utils {
|
pub(crate) mod test_utils {
|
||||||
|
|
||||||
use crate::node::Node;
|
use crate::node::Node;
|
||||||
use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use utils::id::NodeId;
|
use utils::id::NodeId;
|
||||||
/// Test helper: synthesize the requested number of nodes, all in active state.
|
/// Test helper: synthesize the requested number of nodes, all in active state.
|
||||||
@@ -401,14 +264,13 @@ pub(crate) mod test_utils {
|
|||||||
(1..n + 1)
|
(1..n + 1)
|
||||||
.map(|i| {
|
.map(|i| {
|
||||||
(NodeId(i), {
|
(NodeId(i), {
|
||||||
let mut node = Node::new(
|
let node = Node::new(
|
||||||
NodeId(i),
|
NodeId(i),
|
||||||
format!("httphost-{i}"),
|
format!("httphost-{i}"),
|
||||||
80 + i as u16,
|
80 + i as u16,
|
||||||
format!("pghost-{i}"),
|
format!("pghost-{i}"),
|
||||||
5432 + i as u16,
|
5432 + i as u16,
|
||||||
);
|
);
|
||||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
|
||||||
assert!(node.is_available());
|
assert!(node.is_available());
|
||||||
node
|
node
|
||||||
})
|
})
|
||||||
@@ -421,7 +283,7 @@ pub(crate) mod test_utils {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
use crate::tenant_shard::IntentState;
|
use crate::tenant_state::IntentState;
|
||||||
#[test]
|
#[test]
|
||||||
fn scheduler_basic() -> anyhow::Result<()> {
|
fn scheduler_basic() -> anyhow::Result<()> {
|
||||||
let nodes = test_utils::make_test_nodes(2);
|
let nodes = test_utils::make_test_nodes(2);
|
||||||
@@ -430,17 +292,15 @@ mod tests {
|
|||||||
let mut t1_intent = IntentState::new();
|
let mut t1_intent = IntentState::new();
|
||||||
let mut t2_intent = IntentState::new();
|
let mut t2_intent = IntentState::new();
|
||||||
|
|
||||||
let context = ScheduleContext::default();
|
let scheduled = scheduler.schedule_shard(&[])?;
|
||||||
|
|
||||||
let scheduled = scheduler.schedule_shard(&[], &context)?;
|
|
||||||
t1_intent.set_attached(&mut scheduler, Some(scheduled));
|
t1_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||||
let scheduled = scheduler.schedule_shard(&[], &context)?;
|
let scheduled = scheduler.schedule_shard(&[])?;
|
||||||
t2_intent.set_attached(&mut scheduler, Some(scheduled));
|
t2_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||||
|
|
||||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||||
|
|
||||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
|
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
|
||||||
t1_intent.push_secondary(&mut scheduler, scheduled);
|
t1_intent.push_secondary(&mut scheduler, scheduled);
|
||||||
|
|
||||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||||
@@ -22,7 +22,6 @@ diesel::table! {
|
|||||||
placement_policy -> Varchar,
|
placement_policy -> Varchar,
|
||||||
splitting -> Int2,
|
splitting -> Int2,
|
||||||
config -> Text,
|
config -> Text,
|
||||||
scheduling_policy -> Varchar,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -4,12 +4,8 @@ use std::{
|
|||||||
time::Duration,
|
time::Duration,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::{
|
use crate::{metrics, persistence::TenantShardPersistence};
|
||||||
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
|
use pageserver_api::controller_api::PlacementPolicy;
|
||||||
persistence::TenantShardPersistence,
|
|
||||||
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
|
|
||||||
};
|
|
||||||
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
|
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||||
shard::{ShardIdentity, TenantShardId},
|
shard::{ShardIdentity, TenantShardId},
|
||||||
@@ -50,7 +46,7 @@ where
|
|||||||
/// This struct implement Serialize for debugging purposes, but is _not_ persisted
|
/// This struct implement Serialize for debugging purposes, but is _not_ persisted
|
||||||
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
|
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
pub(crate) struct TenantShard {
|
pub(crate) struct TenantState {
|
||||||
pub(crate) tenant_shard_id: TenantShardId,
|
pub(crate) tenant_shard_id: TenantShardId,
|
||||||
|
|
||||||
pub(crate) shard: ShardIdentity,
|
pub(crate) shard: ShardIdentity,
|
||||||
@@ -117,10 +113,6 @@ pub(crate) struct TenantShard {
|
|||||||
/// sending it. This is the mechanism by which compute notifications are included in the scope
|
/// sending it. This is the mechanism by which compute notifications are included in the scope
|
||||||
/// of state that we publish externally in an eventually consistent way.
|
/// of state that we publish externally in an eventually consistent way.
|
||||||
pub(crate) pending_compute_notification: bool,
|
pub(crate) pending_compute_notification: bool,
|
||||||
|
|
||||||
// Support/debug tool: if something is going wrong or flapping with scheduling, this may
|
|
||||||
// be set to a non-active state to avoid making changes while the issue is fixed.
|
|
||||||
scheduling_policy: ShardSchedulingPolicy,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default, Clone, Debug, Serialize)]
|
#[derive(Default, Clone, Debug, Serialize)]
|
||||||
@@ -251,13 +243,8 @@ impl IntentState {
|
|||||||
|
|
||||||
impl Drop for IntentState {
|
impl Drop for IntentState {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
|
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
|
||||||
// We do not check this while panicking, to avoid polluting unit test failures or
|
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
||||||
// other assertions with this assertion's output. It's still wrong to leak these,
|
|
||||||
// but if we already have a panic then we don't need to independently flag this case.
|
|
||||||
if !(std::thread::panicking()) {
|
|
||||||
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -302,26 +289,6 @@ pub enum ReconcileWaitError {
|
|||||||
Failed(TenantShardId, String),
|
Failed(TenantShardId, String),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Eq, PartialEq, Debug)]
|
|
||||||
pub(crate) struct ReplaceSecondary {
|
|
||||||
old_node_id: NodeId,
|
|
||||||
new_node_id: NodeId,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Eq, PartialEq, Debug)]
|
|
||||||
pub(crate) struct MigrateAttachment {
|
|
||||||
old_attached_node_id: NodeId,
|
|
||||||
new_attached_node_id: NodeId,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Eq, PartialEq, Debug)]
|
|
||||||
pub(crate) enum ScheduleOptimization {
|
|
||||||
// Replace one of our secondary locations with a different node
|
|
||||||
ReplaceSecondary(ReplaceSecondary),
|
|
||||||
// Migrate attachment to an existing secondary location
|
|
||||||
MigrateAttachment(MigrateAttachment),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ReconcilerWaiter {
|
impl ReconcilerWaiter {
|
||||||
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
|
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
@@ -354,7 +321,7 @@ pub(crate) struct ReconcilerHandle {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// When a reconcile task completes, it sends this result object
|
/// When a reconcile task completes, it sends this result object
|
||||||
/// to be applied to the primary TenantShard.
|
/// to be applied to the primary TenantState.
|
||||||
pub(crate) struct ReconcileResult {
|
pub(crate) struct ReconcileResult {
|
||||||
pub(crate) sequence: Sequence,
|
pub(crate) sequence: Sequence,
|
||||||
/// On errors, `observed` should be treated as an incompleted description
|
/// On errors, `observed` should be treated as an incompleted description
|
||||||
@@ -367,7 +334,7 @@ pub(crate) struct ReconcileResult {
|
|||||||
pub(crate) generation: Option<Generation>,
|
pub(crate) generation: Option<Generation>,
|
||||||
pub(crate) observed: ObservedState,
|
pub(crate) observed: ObservedState,
|
||||||
|
|
||||||
/// Set [`TenantShard::pending_compute_notification`] from this flag
|
/// Set [`TenantState::pending_compute_notification`] from this flag
|
||||||
pub(crate) pending_compute_notification: bool,
|
pub(crate) pending_compute_notification: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -379,7 +346,7 @@ impl ObservedState {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TenantShard {
|
impl TenantState {
|
||||||
pub(crate) fn new(
|
pub(crate) fn new(
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
shard: ShardIdentity,
|
shard: ShardIdentity,
|
||||||
@@ -400,7 +367,6 @@ impl TenantShard {
|
|||||||
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
|
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||||
last_error: Arc::default(),
|
last_error: Arc::default(),
|
||||||
pending_compute_notification: false,
|
pending_compute_notification: false,
|
||||||
scheduling_policy: ShardSchedulingPolicy::default(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -456,7 +422,6 @@ impl TenantShard {
|
|||||||
fn schedule_attached(
|
fn schedule_attached(
|
||||||
&mut self,
|
&mut self,
|
||||||
scheduler: &mut Scheduler,
|
scheduler: &mut Scheduler,
|
||||||
context: &ScheduleContext,
|
|
||||||
) -> Result<(bool, NodeId), ScheduleError> {
|
) -> Result<(bool, NodeId), ScheduleError> {
|
||||||
// No work to do if we already have an attached tenant
|
// No work to do if we already have an attached tenant
|
||||||
if let Some(node_id) = self.intent.attached {
|
if let Some(node_id) = self.intent.attached {
|
||||||
@@ -470,33 +435,14 @@ impl TenantShard {
|
|||||||
Ok((true, promote_secondary))
|
Ok((true, promote_secondary))
|
||||||
} else {
|
} else {
|
||||||
// Pick a fresh node: either we had no secondaries or none were schedulable
|
// Pick a fresh node: either we had no secondaries or none were schedulable
|
||||||
let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
|
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
|
||||||
tracing::debug!("Selected {} as attached", node_id);
|
tracing::debug!("Selected {} as attached", node_id);
|
||||||
self.intent.set_attached(scheduler, Some(node_id));
|
self.intent.set_attached(scheduler, Some(node_id));
|
||||||
Ok((true, node_id))
|
Ok((true, node_id))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn schedule(
|
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
|
||||||
&mut self,
|
|
||||||
scheduler: &mut Scheduler,
|
|
||||||
context: &mut ScheduleContext,
|
|
||||||
) -> Result<(), ScheduleError> {
|
|
||||||
let r = self.do_schedule(scheduler, context);
|
|
||||||
|
|
||||||
context.avoid(&self.intent.all_pageservers());
|
|
||||||
if let Some(attached) = self.intent.get_attached() {
|
|
||||||
context.push_attached(*attached);
|
|
||||||
}
|
|
||||||
|
|
||||||
r
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn do_schedule(
|
|
||||||
&mut self,
|
|
||||||
scheduler: &mut Scheduler,
|
|
||||||
context: &ScheduleContext,
|
|
||||||
) -> Result<(), ScheduleError> {
|
|
||||||
// TODO: before scheduling new nodes, check if any existing content in
|
// TODO: before scheduling new nodes, check if any existing content in
|
||||||
// self.intent refers to pageservers that are offline, and pick other
|
// self.intent refers to pageservers that are offline, and pick other
|
||||||
// pageservers if so.
|
// pageservers if so.
|
||||||
@@ -504,16 +450,6 @@ impl TenantShard {
|
|||||||
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
|
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
|
||||||
// change their attach location.
|
// change their attach location.
|
||||||
|
|
||||||
match self.scheduling_policy {
|
|
||||||
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
|
|
||||||
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
|
|
||||||
// Warn to make it obvious why other things aren't happening/working, if we skip scheduling
|
|
||||||
tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
|
|
||||||
"Scheduling is disabled by policy {:?}", self.scheduling_policy);
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
||||||
// more work on the same pageservers we're already using.
|
// more work on the same pageservers we're already using.
|
||||||
let mut modified = false;
|
let mut modified = false;
|
||||||
@@ -521,7 +457,22 @@ impl TenantShard {
|
|||||||
// Add/remove nodes to fulfil policy
|
// Add/remove nodes to fulfil policy
|
||||||
use PlacementPolicy::*;
|
use PlacementPolicy::*;
|
||||||
match self.policy {
|
match self.policy {
|
||||||
Attached(secondary_count) => {
|
Single => {
|
||||||
|
// Should have exactly one attached, and zero secondaries
|
||||||
|
if !self.intent.secondary.is_empty() {
|
||||||
|
self.intent.clear_secondary(scheduler);
|
||||||
|
modified = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
|
||||||
|
modified |= modified_attached;
|
||||||
|
|
||||||
|
if !self.intent.secondary.is_empty() {
|
||||||
|
self.intent.clear_secondary(scheduler);
|
||||||
|
modified = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Double(secondary_count) => {
|
||||||
let retain_secondaries = if self.intent.attached.is_none()
|
let retain_secondaries = if self.intent.attached.is_none()
|
||||||
&& scheduler.node_preferred(&self.intent.secondary).is_some()
|
&& scheduler.node_preferred(&self.intent.secondary).is_some()
|
||||||
{
|
{
|
||||||
@@ -540,13 +491,12 @@ impl TenantShard {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Should have exactly one attached, and N secondaries
|
// Should have exactly one attached, and N secondaries
|
||||||
let (modified_attached, attached_node_id) =
|
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
|
||||||
self.schedule_attached(scheduler, context)?;
|
|
||||||
modified |= modified_attached;
|
modified |= modified_attached;
|
||||||
|
|
||||||
let mut used_pageservers = vec![attached_node_id];
|
let mut used_pageservers = vec![attached_node_id];
|
||||||
while self.intent.secondary.len() < secondary_count {
|
while self.intent.secondary.len() < secondary_count {
|
||||||
let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
|
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||||
self.intent.push_secondary(scheduler, node_id);
|
self.intent.push_secondary(scheduler, node_id);
|
||||||
used_pageservers.push(node_id);
|
used_pageservers.push(node_id);
|
||||||
modified = true;
|
modified = true;
|
||||||
@@ -559,7 +509,7 @@ impl TenantShard {
|
|||||||
modified = true;
|
modified = true;
|
||||||
} else if self.intent.secondary.is_empty() {
|
} else if self.intent.secondary.is_empty() {
|
||||||
// Populate secondary by scheduling a fresh node
|
// Populate secondary by scheduling a fresh node
|
||||||
let node_id = scheduler.schedule_shard(&[], context)?;
|
let node_id = scheduler.schedule_shard(&[])?;
|
||||||
self.intent.push_secondary(scheduler, node_id);
|
self.intent.push_secondary(scheduler, node_id);
|
||||||
modified = true;
|
modified = true;
|
||||||
}
|
}
|
||||||
@@ -586,167 +536,6 @@ impl TenantShard {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Optimize attachments: if a shard has a secondary location that is preferable to
|
|
||||||
/// its primary location based on soft constraints, switch that secondary location
|
|
||||||
/// to be attached.
|
|
||||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
|
||||||
pub(crate) fn optimize_attachment(
|
|
||||||
&self,
|
|
||||||
nodes: &HashMap<NodeId, Node>,
|
|
||||||
schedule_context: &ScheduleContext,
|
|
||||||
) -> Option<ScheduleOptimization> {
|
|
||||||
let attached = (*self.intent.get_attached())?;
|
|
||||||
if self.intent.secondary.is_empty() {
|
|
||||||
// We can only do useful work if we have both attached and secondary locations: this
|
|
||||||
// function doesn't schedule new locations, only swaps between attached and secondaries.
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
let current_affinity_score = schedule_context.get_node_affinity(attached);
|
|
||||||
let current_attachment_count = schedule_context.get_node_attachments(attached);
|
|
||||||
|
|
||||||
// Generate score for each node, dropping any un-schedulable nodes.
|
|
||||||
let all_pageservers = self.intent.all_pageservers();
|
|
||||||
let mut scores = all_pageservers
|
|
||||||
.iter()
|
|
||||||
.flat_map(|node_id| {
|
|
||||||
if matches!(
|
|
||||||
nodes
|
|
||||||
.get(node_id)
|
|
||||||
.map(|n| n.may_schedule())
|
|
||||||
.unwrap_or(MaySchedule::No),
|
|
||||||
MaySchedule::No
|
|
||||||
) {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
let affinity_score = schedule_context.get_node_affinity(*node_id);
|
|
||||||
let attachment_count = schedule_context.get_node_attachments(*node_id);
|
|
||||||
Some((*node_id, affinity_score, attachment_count))
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
// Sort precedence:
|
|
||||||
// 1st - prefer nodes with the lowest total affinity score
|
|
||||||
// 2nd - prefer nodes with the lowest number of attachments in this context
|
|
||||||
// 3rd - if all else is equal, sort by node ID for determinism in tests.
|
|
||||||
scores.sort_by_key(|i| (i.1, i.2, i.0));
|
|
||||||
|
|
||||||
if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
|
|
||||||
scores.first()
|
|
||||||
{
|
|
||||||
if attached != *preferred_node {
|
|
||||||
// The best alternative must be more than 1 better than us, otherwise we could end
|
|
||||||
// up flapping back next time we're called (e.g. there's no point migrating from
|
|
||||||
// a location with score 1 to a score zero, because on next location the situation
|
|
||||||
// would be the same, but in reverse).
|
|
||||||
if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
|
|
||||||
|| current_attachment_count > *preferred_attachment_count + 1
|
|
||||||
{
|
|
||||||
tracing::info!(
|
|
||||||
"Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
|
|
||||||
self.intent.get_secondary()
|
|
||||||
);
|
|
||||||
return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
|
||||||
old_attached_node_id: attached,
|
|
||||||
new_attached_node_id: *preferred_node,
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
tracing::debug!(
|
|
||||||
"Node {} is already preferred (score {:?})",
|
|
||||||
preferred_node,
|
|
||||||
preferred_affinity_score
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fall-through: we didn't find an optimization
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
|
||||||
pub(crate) fn optimize_secondary(
|
|
||||||
&self,
|
|
||||||
scheduler: &Scheduler,
|
|
||||||
schedule_context: &ScheduleContext,
|
|
||||||
) -> Option<ScheduleOptimization> {
|
|
||||||
if self.intent.secondary.is_empty() {
|
|
||||||
// We can only do useful work if we have both attached and secondary locations: this
|
|
||||||
// function doesn't schedule new locations, only swaps between attached and secondaries.
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
for secondary in self.intent.get_secondary() {
|
|
||||||
let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
|
|
||||||
// We're already on a node unaffected any affinity constraints,
|
|
||||||
// so we won't change it.
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Let the scheduler suggest a node, where it would put us if we were scheduling afresh
|
|
||||||
// This implicitly limits the choice to nodes that are available, and prefers nodes
|
|
||||||
// with lower utilization.
|
|
||||||
let Ok(candidate_node) =
|
|
||||||
scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
|
|
||||||
else {
|
|
||||||
// A scheduling error means we have no possible candidate replacements
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
|
|
||||||
let candidate_affinity_score = schedule_context
|
|
||||||
.nodes
|
|
||||||
.get(&candidate_node)
|
|
||||||
.unwrap_or(&AffinityScore::FREE);
|
|
||||||
|
|
||||||
// The best alternative must be more than 1 better than us, otherwise we could end
|
|
||||||
// up flapping back next time we're called.
|
|
||||||
if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
|
|
||||||
// If some other node is available and has a lower score than this node, then
|
|
||||||
// that other node is a good place to migrate to.
|
|
||||||
tracing::info!(
|
|
||||||
"Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
|
|
||||||
self.intent.get_secondary()
|
|
||||||
);
|
|
||||||
return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
|
||||||
old_node_id: *secondary,
|
|
||||||
new_node_id: candidate_node,
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn apply_optimization(
|
|
||||||
&mut self,
|
|
||||||
scheduler: &mut Scheduler,
|
|
||||||
optimization: ScheduleOptimization,
|
|
||||||
) {
|
|
||||||
metrics::METRICS_REGISTRY
|
|
||||||
.metrics_group
|
|
||||||
.storage_controller_schedule_optimization
|
|
||||||
.inc();
|
|
||||||
|
|
||||||
match optimization {
|
|
||||||
ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
|
||||||
old_attached_node_id,
|
|
||||||
new_attached_node_id,
|
|
||||||
}) => {
|
|
||||||
self.intent.demote_attached(old_attached_node_id);
|
|
||||||
self.intent
|
|
||||||
.promote_attached(scheduler, new_attached_node_id);
|
|
||||||
}
|
|
||||||
ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
|
||||||
old_node_id,
|
|
||||||
new_node_id,
|
|
||||||
}) => {
|
|
||||||
self.intent.remove_secondary(scheduler, old_node_id);
|
|
||||||
self.intent.push_secondary(scheduler, new_node_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Query whether the tenant's observed state for attached node matches its intent state, and if so,
|
/// Query whether the tenant's observed state for attached node matches its intent state, and if so,
|
||||||
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
|
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
|
||||||
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
|
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
|
||||||
@@ -788,12 +577,7 @@ impl TenantShard {
|
|||||||
.generation
|
.generation
|
||||||
.expect("Attempted to enter attached state without a generation");
|
.expect("Attempted to enter attached state without a generation");
|
||||||
|
|
||||||
let wanted_conf = attached_location_conf(
|
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
|
||||||
generation,
|
|
||||||
&self.shard,
|
|
||||||
&self.config,
|
|
||||||
!self.intent.secondary.is_empty(),
|
|
||||||
);
|
|
||||||
match self.observed.locations.get(&node_id) {
|
match self.observed.locations.get(&node_id) {
|
||||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
|
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
|
||||||
Some(_) | None => {
|
Some(_) | None => {
|
||||||
@@ -891,19 +675,6 @@ impl TenantShard {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pre-checks done: finally check whether we may actually do the work
|
|
||||||
match self.scheduling_policy {
|
|
||||||
ShardSchedulingPolicy::Active
|
|
||||||
| ShardSchedulingPolicy::Essential
|
|
||||||
| ShardSchedulingPolicy::Pause => {}
|
|
||||||
ShardSchedulingPolicy::Stop => {
|
|
||||||
// We only reach this point if there is work to do and we're going to skip
|
|
||||||
// doing it: warn it obvious why this tenant isn't doing what it ought to.
|
|
||||||
tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build list of nodes from which the reconciler should detach
|
// Build list of nodes from which the reconciler should detach
|
||||||
let mut detach = Vec::new();
|
let mut detach = Vec::new();
|
||||||
for node_id in self.observed.locations.keys() {
|
for node_id in self.observed.locations.keys() {
|
||||||
@@ -957,10 +728,7 @@ impl TenantShard {
|
|||||||
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
||||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||||
metrics::METRICS_REGISTRY
|
metrics::RECONCILER.spawned.inc();
|
||||||
.metrics_group
|
|
||||||
.storage_controller_reconcile_spawn
|
|
||||||
.inc();
|
|
||||||
let result_tx = result_tx.clone();
|
let result_tx = result_tx.clone();
|
||||||
let join_handle = tokio::task::spawn(
|
let join_handle = tokio::task::spawn(
|
||||||
async move {
|
async move {
|
||||||
@@ -978,12 +746,10 @@ impl TenantShard {
|
|||||||
// TODO: wrap all remote API operations in cancellation check
|
// TODO: wrap all remote API operations in cancellation check
|
||||||
// as well.
|
// as well.
|
||||||
if reconciler.cancel.is_cancelled() {
|
if reconciler.cancel.is_cancelled() {
|
||||||
metrics::METRICS_REGISTRY
|
metrics::RECONCILER
|
||||||
.metrics_group
|
.complete
|
||||||
.storage_controller_reconcile_complete
|
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
|
||||||
.inc(ReconcileCompleteLabelGroup {
|
.inc();
|
||||||
status: ReconcileOutcome::Cancel,
|
|
||||||
});
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -998,18 +764,18 @@ impl TenantShard {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Update result counter
|
// Update result counter
|
||||||
let outcome_label = match &result {
|
match &result {
|
||||||
Ok(_) => ReconcileOutcome::Success,
|
Ok(_) => metrics::RECONCILER
|
||||||
Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
|
.complete
|
||||||
Err(_) => ReconcileOutcome::Error,
|
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
|
||||||
};
|
Err(ReconcileError::Cancel) => metrics::RECONCILER
|
||||||
|
.complete
|
||||||
metrics::METRICS_REGISTRY
|
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
|
||||||
.metrics_group
|
Err(_) => metrics::RECONCILER
|
||||||
.storage_controller_reconcile_complete
|
.complete
|
||||||
.inc(ReconcileCompleteLabelGroup {
|
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
|
||||||
status: outcome_label,
|
}
|
||||||
});
|
.inc();
|
||||||
|
|
||||||
result_tx
|
result_tx
|
||||||
.send(ReconcileResult {
|
.send(ReconcileResult {
|
||||||
@@ -1040,22 +806,6 @@ impl TenantShard {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
|
|
||||||
/// if it is not already running
|
|
||||||
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
|
|
||||||
if self.reconciler.is_some() {
|
|
||||||
Some(ReconcilerWaiter {
|
|
||||||
tenant_shard_id: self.tenant_shard_id,
|
|
||||||
seq_wait: self.waiter.clone(),
|
|
||||||
error_seq_wait: self.error_waiter.clone(),
|
|
||||||
error: self.last_error.clone(),
|
|
||||||
seq: self.sequence,
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Called when a ReconcileResult has been emitted and the service is updating
|
/// Called when a ReconcileResult has been emitted and the service is updating
|
||||||
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
|
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
|
||||||
/// the handle to indicate there is no longer a reconciliation in progress.
|
/// the handle to indicate there is no longer a reconciliation in progress.
|
||||||
@@ -1081,40 +831,6 @@ impl TenantShard {
|
|||||||
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
|
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
|
|
||||||
self.scheduling_policy = p;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
|
|
||||||
&self.scheduling_policy
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn from_persistent(
|
|
||||||
tsp: TenantShardPersistence,
|
|
||||||
intent: IntentState,
|
|
||||||
) -> anyhow::Result<Self> {
|
|
||||||
let tenant_shard_id = tsp.get_tenant_shard_id()?;
|
|
||||||
let shard_identity = tsp.get_shard_identity()?;
|
|
||||||
|
|
||||||
Ok(Self {
|
|
||||||
tenant_shard_id,
|
|
||||||
shard: shard_identity,
|
|
||||||
sequence: Sequence::initial(),
|
|
||||||
generation: tsp.generation.map(|g| Generation::new(g as u32)),
|
|
||||||
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
|
|
||||||
intent,
|
|
||||||
observed: ObservedState::new(),
|
|
||||||
config: serde_json::from_str(&tsp.config).unwrap(),
|
|
||||||
reconciler: None,
|
|
||||||
splitting: tsp.splitting,
|
|
||||||
waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
|
||||||
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
|
||||||
last_error: Arc::default(),
|
|
||||||
pending_compute_notification: false,
|
|
||||||
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
|
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
|
||||||
TenantShardPersistence {
|
TenantShardPersistence {
|
||||||
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
|
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
|
||||||
@@ -1126,7 +842,6 @@ impl TenantShard {
|
|||||||
placement_policy: serde_json::to_string(&self.policy).unwrap(),
|
placement_policy: serde_json::to_string(&self.policy).unwrap(),
|
||||||
config: serde_json::to_string(&self.config).unwrap(),
|
config: serde_json::to_string(&self.config).unwrap(),
|
||||||
splitting: SplitState::default(),
|
splitting: SplitState::default(),
|
||||||
scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1143,7 +858,7 @@ pub(crate) mod tests {
|
|||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
|
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
|
||||||
let tenant_id = TenantId::generate();
|
let tenant_id = TenantId::generate();
|
||||||
let shard_number = ShardNumber(0);
|
let shard_number = ShardNumber(0);
|
||||||
let shard_count = ShardCount::new(1);
|
let shard_count = ShardCount::new(1);
|
||||||
@@ -1153,7 +868,7 @@ pub(crate) mod tests {
|
|||||||
shard_number,
|
shard_number,
|
||||||
shard_count,
|
shard_count,
|
||||||
};
|
};
|
||||||
TenantShard::new(
|
TenantState::new(
|
||||||
tenant_shard_id,
|
tenant_shard_id,
|
||||||
ShardIdentity::new(
|
ShardIdentity::new(
|
||||||
shard_number,
|
shard_number,
|
||||||
@@ -1165,32 +880,6 @@ pub(crate) mod tests {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
|
|
||||||
let tenant_id = TenantId::generate();
|
|
||||||
|
|
||||||
(0..shard_count.count())
|
|
||||||
.map(|i| {
|
|
||||||
let shard_number = ShardNumber(i);
|
|
||||||
|
|
||||||
let tenant_shard_id = TenantShardId {
|
|
||||||
tenant_id,
|
|
||||||
shard_number,
|
|
||||||
shard_count,
|
|
||||||
};
|
|
||||||
TenantShard::new(
|
|
||||||
tenant_shard_id,
|
|
||||||
ShardIdentity::new(
|
|
||||||
shard_number,
|
|
||||||
shard_count,
|
|
||||||
pageserver_api::shard::ShardStripeSize(32768),
|
|
||||||
)
|
|
||||||
.unwrap(),
|
|
||||||
policy.clone(),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Test the scheduling behaviors used when a tenant configured for HA is subject
|
/// Test the scheduling behaviors used when a tenant configured for HA is subject
|
||||||
/// to nodes being marked offline.
|
/// to nodes being marked offline.
|
||||||
#[test]
|
#[test]
|
||||||
@@ -1200,26 +889,25 @@ pub(crate) mod tests {
|
|||||||
let mut nodes = make_test_nodes(3);
|
let mut nodes = make_test_nodes(3);
|
||||||
|
|
||||||
let mut scheduler = Scheduler::new(nodes.values());
|
let mut scheduler = Scheduler::new(nodes.values());
|
||||||
let mut context = ScheduleContext::default();
|
|
||||||
|
|
||||||
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
||||||
tenant_shard
|
tenant_state
|
||||||
.schedule(&mut scheduler, &mut context)
|
.schedule(&mut scheduler)
|
||||||
.expect("we have enough nodes, scheduling should work");
|
.expect("we have enough nodes, scheduling should work");
|
||||||
|
|
||||||
// Expect to initially be schedule on to different nodes
|
// Expect to initially be schedule on to different nodes
|
||||||
assert_eq!(tenant_shard.intent.secondary.len(), 1);
|
assert_eq!(tenant_state.intent.secondary.len(), 1);
|
||||||
assert!(tenant_shard.intent.attached.is_some());
|
assert!(tenant_state.intent.attached.is_some());
|
||||||
|
|
||||||
let attached_node_id = tenant_shard.intent.attached.unwrap();
|
let attached_node_id = tenant_state.intent.attached.unwrap();
|
||||||
let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
|
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
|
||||||
assert_ne!(attached_node_id, secondary_node_id);
|
assert_ne!(attached_node_id, secondary_node_id);
|
||||||
|
|
||||||
// Notifying the attached node is offline should demote it to a secondary
|
// Notifying the attached node is offline should demote it to a secondary
|
||||||
let changed = tenant_shard.intent.demote_attached(attached_node_id);
|
let changed = tenant_state.intent.demote_attached(attached_node_id);
|
||||||
assert!(changed);
|
assert!(changed);
|
||||||
assert!(tenant_shard.intent.attached.is_none());
|
assert!(tenant_state.intent.attached.is_none());
|
||||||
assert_eq!(tenant_shard.intent.secondary.len(), 2);
|
assert_eq!(tenant_state.intent.secondary.len(), 2);
|
||||||
|
|
||||||
// Update the scheduler state to indicate the node is offline
|
// Update the scheduler state to indicate the node is offline
|
||||||
nodes
|
nodes
|
||||||
@@ -1229,18 +917,18 @@ pub(crate) mod tests {
|
|||||||
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
|
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
|
||||||
|
|
||||||
// Scheduling the node should promote the still-available secondary node to attached
|
// Scheduling the node should promote the still-available secondary node to attached
|
||||||
tenant_shard
|
tenant_state
|
||||||
.schedule(&mut scheduler, &mut context)
|
.schedule(&mut scheduler)
|
||||||
.expect("active nodes are available");
|
.expect("active nodes are available");
|
||||||
assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
|
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
|
||||||
|
|
||||||
// The original attached node should have been retained as a secondary
|
// The original attached node should have been retained as a secondary
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
*tenant_shard.intent.secondary.iter().last().unwrap(),
|
*tenant_state.intent.secondary.iter().last().unwrap(),
|
||||||
attached_node_id
|
attached_node_id
|
||||||
);
|
);
|
||||||
|
|
||||||
tenant_shard.intent.clear(&mut scheduler);
|
tenant_state.intent.clear(&mut scheduler);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1250,263 +938,48 @@ pub(crate) mod tests {
|
|||||||
let nodes = make_test_nodes(3);
|
let nodes = make_test_nodes(3);
|
||||||
let mut scheduler = Scheduler::new(nodes.values());
|
let mut scheduler = Scheduler::new(nodes.values());
|
||||||
|
|
||||||
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
||||||
|
|
||||||
tenant_shard.observed.locations.insert(
|
tenant_state.observed.locations.insert(
|
||||||
NodeId(3),
|
NodeId(3),
|
||||||
ObservedStateLocation {
|
ObservedStateLocation {
|
||||||
conf: Some(LocationConfig {
|
conf: Some(LocationConfig {
|
||||||
mode: LocationConfigMode::AttachedMulti,
|
mode: LocationConfigMode::AttachedMulti,
|
||||||
generation: Some(2),
|
generation: Some(2),
|
||||||
secondary_conf: None,
|
secondary_conf: None,
|
||||||
shard_number: tenant_shard.shard.number.0,
|
shard_number: tenant_state.shard.number.0,
|
||||||
shard_count: tenant_shard.shard.count.literal(),
|
shard_count: tenant_state.shard.count.literal(),
|
||||||
shard_stripe_size: tenant_shard.shard.stripe_size.0,
|
shard_stripe_size: tenant_state.shard.stripe_size.0,
|
||||||
tenant_conf: TenantConfig::default(),
|
tenant_conf: TenantConfig::default(),
|
||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
tenant_shard.observed.locations.insert(
|
tenant_state.observed.locations.insert(
|
||||||
NodeId(2),
|
NodeId(2),
|
||||||
ObservedStateLocation {
|
ObservedStateLocation {
|
||||||
conf: Some(LocationConfig {
|
conf: Some(LocationConfig {
|
||||||
mode: LocationConfigMode::AttachedStale,
|
mode: LocationConfigMode::AttachedStale,
|
||||||
generation: Some(1),
|
generation: Some(1),
|
||||||
secondary_conf: None,
|
secondary_conf: None,
|
||||||
shard_number: tenant_shard.shard.number.0,
|
shard_number: tenant_state.shard.number.0,
|
||||||
shard_count: tenant_shard.shard.count.literal(),
|
shard_count: tenant_state.shard.count.literal(),
|
||||||
shard_stripe_size: tenant_shard.shard.stripe_size.0,
|
shard_stripe_size: tenant_state.shard.stripe_size.0,
|
||||||
tenant_conf: TenantConfig::default(),
|
tenant_conf: TenantConfig::default(),
|
||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
tenant_shard.intent_from_observed(&mut scheduler);
|
tenant_state.intent_from_observed(&mut scheduler);
|
||||||
|
|
||||||
// The highest generationed attached location gets used as attached
|
// The highest generationed attached location gets used as attached
|
||||||
assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
|
assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
|
||||||
// Other locations get used as secondary
|
// Other locations get used as secondary
|
||||||
assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
|
assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
|
||||||
|
|
||||||
scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
|
scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
|
||||||
|
|
||||||
tenant_shard.intent.clear(&mut scheduler);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn scheduling_mode() -> anyhow::Result<()> {
|
|
||||||
let nodes = make_test_nodes(3);
|
|
||||||
let mut scheduler = Scheduler::new(nodes.values());
|
|
||||||
|
|
||||||
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
|
||||||
|
|
||||||
// In pause mode, schedule() shouldn't do anything
|
|
||||||
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
|
|
||||||
assert!(tenant_shard
|
|
||||||
.schedule(&mut scheduler, &mut ScheduleContext::default())
|
|
||||||
.is_ok());
|
|
||||||
assert!(tenant_shard.intent.all_pageservers().is_empty());
|
|
||||||
|
|
||||||
// In active mode, schedule() works
|
|
||||||
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
|
|
||||||
assert!(tenant_shard
|
|
||||||
.schedule(&mut scheduler, &mut ScheduleContext::default())
|
|
||||||
.is_ok());
|
|
||||||
assert!(!tenant_shard.intent.all_pageservers().is_empty());
|
|
||||||
|
|
||||||
tenant_shard.intent.clear(&mut scheduler);
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn optimize_attachment() -> anyhow::Result<()> {
|
|
||||||
let nodes = make_test_nodes(3);
|
|
||||||
let mut scheduler = Scheduler::new(nodes.values());
|
|
||||||
|
|
||||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
|
||||||
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
|
||||||
|
|
||||||
// Initially: both nodes attached on shard 1, and both have secondary locations
|
|
||||||
// on different nodes.
|
|
||||||
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
|
||||||
shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
|
|
||||||
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
|
||||||
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
|
|
||||||
|
|
||||||
let mut schedule_context = ScheduleContext::default();
|
|
||||||
schedule_context.avoid(&shard_a.intent.all_pageservers());
|
|
||||||
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
|
|
||||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
|
||||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
|
||||||
|
|
||||||
let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
|
|
||||||
|
|
||||||
// Either shard should recognize that it has the option to switch to a secondary location where there
|
|
||||||
// would be no other shards from the same tenant, and request to do so.
|
|
||||||
assert_eq!(
|
|
||||||
optimization_a,
|
|
||||||
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
|
||||||
old_attached_node_id: NodeId(1),
|
|
||||||
new_attached_node_id: NodeId(2)
|
|
||||||
}))
|
|
||||||
);
|
|
||||||
|
|
||||||
// Note that these optimizing two shards in the same tenant with the same ScheduleContext is
|
|
||||||
// mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
|
|
||||||
// of [`Service::optimize_all`] to avoid trying
|
|
||||||
// to do optimizations for multiple shards in the same tenant at the same time. Generating
|
|
||||||
// both optimizations is just done for test purposes
|
|
||||||
let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
|
|
||||||
assert_eq!(
|
|
||||||
optimization_b,
|
|
||||||
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
|
||||||
old_attached_node_id: NodeId(1),
|
|
||||||
new_attached_node_id: NodeId(3)
|
|
||||||
}))
|
|
||||||
);
|
|
||||||
|
|
||||||
// Applying these optimizations should result in the end state proposed
|
|
||||||
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
|
|
||||||
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
|
|
||||||
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
|
|
||||||
shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
|
|
||||||
assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
|
|
||||||
assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
|
|
||||||
|
|
||||||
shard_a.intent.clear(&mut scheduler);
|
|
||||||
shard_b.intent.clear(&mut scheduler);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn optimize_secondary() -> anyhow::Result<()> {
|
|
||||||
let nodes = make_test_nodes(4);
|
|
||||||
let mut scheduler = Scheduler::new(nodes.values());
|
|
||||||
|
|
||||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
|
||||||
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
|
||||||
|
|
||||||
// Initially: both nodes attached on shard 1, and both have secondary locations
|
|
||||||
// on different nodes.
|
|
||||||
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
|
||||||
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
|
|
||||||
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
|
|
||||||
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
|
|
||||||
|
|
||||||
let mut schedule_context = ScheduleContext::default();
|
|
||||||
schedule_context.avoid(&shard_a.intent.all_pageservers());
|
|
||||||
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
|
|
||||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
|
||||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
|
||||||
|
|
||||||
let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
|
|
||||||
|
|
||||||
// Since there is a node with no locations available, the node with two locations for the
|
|
||||||
// same tenant should generate an optimization to move one away
|
|
||||||
assert_eq!(
|
|
||||||
optimization_a,
|
|
||||||
Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
|
||||||
old_node_id: NodeId(3),
|
|
||||||
new_node_id: NodeId(4)
|
|
||||||
}))
|
|
||||||
);
|
|
||||||
|
|
||||||
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
|
|
||||||
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
|
|
||||||
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
|
|
||||||
|
|
||||||
shard_a.intent.clear(&mut scheduler);
|
|
||||||
shard_b.intent.clear(&mut scheduler);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Optimize til quiescent: this emulates what Service::optimize_all does, when
|
|
||||||
// called repeatedly in the background.
|
|
||||||
fn optimize_til_idle(
|
|
||||||
nodes: &HashMap<NodeId, Node>,
|
|
||||||
scheduler: &mut Scheduler,
|
|
||||||
shards: &mut [TenantShard],
|
|
||||||
) {
|
|
||||||
let mut loop_n = 0;
|
|
||||||
loop {
|
|
||||||
let mut schedule_context = ScheduleContext::default();
|
|
||||||
let mut any_changed = false;
|
|
||||||
|
|
||||||
for shard in shards.iter() {
|
|
||||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
|
||||||
if let Some(attached) = shard.intent.get_attached() {
|
|
||||||
schedule_context.push_attached(*attached);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for shard in shards.iter_mut() {
|
|
||||||
let optimization = shard.optimize_attachment(nodes, &schedule_context);
|
|
||||||
if let Some(optimization) = optimization {
|
|
||||||
shard.apply_optimization(scheduler, optimization);
|
|
||||||
any_changed = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
let optimization = shard.optimize_secondary(scheduler, &schedule_context);
|
|
||||||
if let Some(optimization) = optimization {
|
|
||||||
shard.apply_optimization(scheduler, optimization);
|
|
||||||
any_changed = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !any_changed {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assert no infinite loop
|
|
||||||
loop_n += 1;
|
|
||||||
assert!(loop_n < 1000);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
|
|
||||||
/// that it converges.
|
|
||||||
#[test]
|
|
||||||
fn optimize_add_nodes() -> anyhow::Result<()> {
|
|
||||||
let nodes = make_test_nodes(4);
|
|
||||||
|
|
||||||
// Only show the scheduler a couple of nodes
|
|
||||||
let mut scheduler = Scheduler::new([].iter());
|
|
||||||
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
|
|
||||||
scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
|
|
||||||
|
|
||||||
let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
|
|
||||||
let mut schedule_context = ScheduleContext::default();
|
|
||||||
for shard in &mut shards {
|
|
||||||
assert!(shard
|
|
||||||
.schedule(&mut scheduler, &mut schedule_context)
|
|
||||||
.is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
// We should see equal number of locations on the two nodes.
|
|
||||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
|
|
||||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
|
|
||||||
|
|
||||||
// Add another two nodes: we should see the shards spread out when their optimize
|
|
||||||
// methods are called
|
|
||||||
scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
|
|
||||||
scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
|
|
||||||
optimize_til_idle(&nodes, &mut scheduler, &mut shards);
|
|
||||||
|
|
||||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
|
|
||||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
|
|
||||||
assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
|
|
||||||
assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
|
|
||||||
|
|
||||||
for shard in shards.iter_mut() {
|
|
||||||
shard.intent.clear(&mut scheduler);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
tenant_state.intent.clear(&mut scheduler);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -86,10 +86,7 @@ where
|
|||||||
.stdout(process_log_file)
|
.stdout(process_log_file)
|
||||||
.stderr(same_file_for_stderr)
|
.stderr(same_file_for_stderr)
|
||||||
.args(args);
|
.args(args);
|
||||||
|
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
|
||||||
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
|
|
||||||
fill_rust_env_vars(background_command),
|
|
||||||
));
|
|
||||||
filled_cmd.envs(envs);
|
filled_cmd.envs(envs);
|
||||||
|
|
||||||
let pid_file_to_check = match &initial_pid_file {
|
let pid_file_to_check = match &initial_pid_file {
|
||||||
@@ -271,15 +268,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
|||||||
cmd
|
cmd
|
||||||
}
|
}
|
||||||
|
|
||||||
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
|
|
||||||
for (var, val) in std::env::vars() {
|
|
||||||
if var.starts_with("NEON_PAGESERVER_") {
|
|
||||||
cmd = cmd.env(var, val);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cmd
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
|
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
|
||||||
/// 1. Claims a pidfile with a fcntl lock on it and
|
/// 1. Claims a pidfile with a fcntl lock on it and
|
||||||
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
|
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
|
||||||
@@ -306,7 +294,7 @@ where
|
|||||||
// is in state 'taken' but the thread that would unlock it is
|
// is in state 'taken' but the thread that would unlock it is
|
||||||
// not there.
|
// not there.
|
||||||
// 2. A rust object that represented some external resource in the
|
// 2. A rust object that represented some external resource in the
|
||||||
// parent now got implicitly copied by the fork, even though
|
// parent now got implicitly copied by the the fork, even though
|
||||||
// the object's type is not `Copy`. The parent program may use
|
// the object's type is not `Copy`. The parent program may use
|
||||||
// non-copyability as way to enforce unique ownership of an
|
// non-copyability as way to enforce unique ownership of an
|
||||||
// external resource in the typesystem. The fork breaks that
|
// external resource in the typesystem. The fork breaks that
|
||||||
|
|||||||
@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
|||||||
use control_plane::safekeeper::SafekeeperNode;
|
use control_plane::safekeeper::SafekeeperNode;
|
||||||
use control_plane::storage_controller::StorageController;
|
use control_plane::storage_controller::StorageController;
|
||||||
use control_plane::{broker, local_env};
|
use control_plane::{broker, local_env};
|
||||||
use pageserver_api::controller_api::PlacementPolicy;
|
use pageserver_api::controller_api::{
|
||||||
|
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||||
|
};
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||||
};
|
};
|
||||||
@@ -417,54 +419,6 @@ async fn handle_tenant(
|
|||||||
println!("{} {:?}", t.id, t.state);
|
println!("{} {:?}", t.id, t.state);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Some(("import", import_match)) => {
|
|
||||||
let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
|
|
||||||
|
|
||||||
let storage_controller = StorageController::from_env(env);
|
|
||||||
let create_response = storage_controller.tenant_import(tenant_id).await?;
|
|
||||||
|
|
||||||
let shard_zero = create_response
|
|
||||||
.shards
|
|
||||||
.first()
|
|
||||||
.expect("Import response omitted shards");
|
|
||||||
|
|
||||||
let attached_pageserver_id = shard_zero.node_id;
|
|
||||||
let pageserver =
|
|
||||||
PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
|
|
||||||
|
|
||||||
println!(
|
|
||||||
"Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
|
|
||||||
);
|
|
||||||
|
|
||||||
let timelines = pageserver
|
|
||||||
.http_client
|
|
||||||
.list_timelines(shard_zero.shard_id)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
// Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
|
|
||||||
let main_timeline = timelines
|
|
||||||
.iter()
|
|
||||||
.find(|t| t.ancestor_timeline_id.is_none())
|
|
||||||
.expect("No timelines found")
|
|
||||||
.timeline_id;
|
|
||||||
|
|
||||||
let mut branch_i = 0;
|
|
||||||
for timeline in timelines.iter() {
|
|
||||||
let branch_name = if timeline.timeline_id == main_timeline {
|
|
||||||
"main".to_string()
|
|
||||||
} else {
|
|
||||||
branch_i += 1;
|
|
||||||
format!("branch_{branch_i}")
|
|
||||||
};
|
|
||||||
|
|
||||||
println!(
|
|
||||||
"Importing timeline {tenant_id}/{} as branch {branch_name}",
|
|
||||||
timeline.timeline_id
|
|
||||||
);
|
|
||||||
|
|
||||||
env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some(("create", create_match)) => {
|
Some(("create", create_match)) => {
|
||||||
let tenant_conf: HashMap<_, _> = create_match
|
let tenant_conf: HashMap<_, _> = create_match
|
||||||
.get_many::<String>("config")
|
.get_many::<String>("config")
|
||||||
@@ -483,7 +437,7 @@ async fn handle_tenant(
|
|||||||
|
|
||||||
let placement_policy = match create_match.get_one::<String>("placement-policy") {
|
let placement_policy = match create_match.get_one::<String>("placement-policy") {
|
||||||
Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
|
Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
|
||||||
_ => PlacementPolicy::Attached(0),
|
_ => PlacementPolicy::Single,
|
||||||
};
|
};
|
||||||
|
|
||||||
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
|
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
|
||||||
@@ -569,6 +523,88 @@ async fn handle_tenant(
|
|||||||
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
||||||
println!("tenant {tenant_id} successfully configured on the pageserver");
|
println!("tenant {tenant_id} successfully configured on the pageserver");
|
||||||
}
|
}
|
||||||
|
Some(("migrate", matches)) => {
|
||||||
|
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
|
||||||
|
let new_pageserver = get_pageserver(env, matches)?;
|
||||||
|
let new_pageserver_id = new_pageserver.conf.id;
|
||||||
|
|
||||||
|
let storage_controller = StorageController::from_env(env);
|
||||||
|
storage_controller
|
||||||
|
.tenant_migrate(tenant_shard_id, new_pageserver_id)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
|
||||||
|
}
|
||||||
|
Some(("status", matches)) => {
|
||||||
|
let tenant_id = get_tenant_id(matches, env)?;
|
||||||
|
|
||||||
|
let mut shard_table = comfy_table::Table::new();
|
||||||
|
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
|
||||||
|
|
||||||
|
let mut tenant_synthetic_size = None;
|
||||||
|
|
||||||
|
let storage_controller = StorageController::from_env(env);
|
||||||
|
for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
|
||||||
|
let pageserver =
|
||||||
|
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
|
||||||
|
|
||||||
|
let size = pageserver
|
||||||
|
.http_client
|
||||||
|
.tenant_details(shard.shard_id)
|
||||||
|
.await?
|
||||||
|
.tenant_info
|
||||||
|
.current_physical_size
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
shard_table.add_row([
|
||||||
|
format!("{}", shard.shard_id.shard_slug()),
|
||||||
|
format!("{}", shard.node_id.0),
|
||||||
|
format!("{} MiB", size / (1024 * 1024)),
|
||||||
|
]);
|
||||||
|
|
||||||
|
if shard.shard_id.is_zero() {
|
||||||
|
tenant_synthetic_size =
|
||||||
|
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(synthetic_size) = tenant_synthetic_size else {
|
||||||
|
bail!("Shard 0 not found")
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut tenant_table = comfy_table::Table::new();
|
||||||
|
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
|
||||||
|
tenant_table.add_row([
|
||||||
|
"Synthetic size".to_string(),
|
||||||
|
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
|
||||||
|
]);
|
||||||
|
|
||||||
|
println!("{tenant_table}");
|
||||||
|
println!("{shard_table}");
|
||||||
|
}
|
||||||
|
Some(("shard-split", matches)) => {
|
||||||
|
let tenant_id = get_tenant_id(matches, env)?;
|
||||||
|
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
|
||||||
|
let shard_stripe_size: Option<ShardStripeSize> = matches
|
||||||
|
.get_one::<Option<ShardStripeSize>>("shard-stripe-size")
|
||||||
|
.cloned()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let storage_controller = StorageController::from_env(env);
|
||||||
|
let result = storage_controller
|
||||||
|
.tenant_split(tenant_id, shard_count, shard_stripe_size)
|
||||||
|
.await?;
|
||||||
|
println!(
|
||||||
|
"Split tenant {} into shards {}",
|
||||||
|
tenant_id,
|
||||||
|
result
|
||||||
|
.new_shards
|
||||||
|
.iter()
|
||||||
|
.map(|s| format!("{:?}", s))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(",")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
||||||
None => bail!("no tenant subcommand provided"),
|
None => bail!("no tenant subcommand provided"),
|
||||||
@@ -1106,6 +1142,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Some(("set-state", subcommand_args)) => {
|
||||||
|
let pageserver = get_pageserver(env, subcommand_args)?;
|
||||||
|
let scheduling = subcommand_args.get_one("scheduling");
|
||||||
|
let availability = subcommand_args.get_one("availability");
|
||||||
|
|
||||||
|
let storage_controller = StorageController::from_env(env);
|
||||||
|
storage_controller
|
||||||
|
.node_configure(NodeConfigureRequest {
|
||||||
|
node_id: pageserver.conf.id,
|
||||||
|
scheduling: scheduling.cloned(),
|
||||||
|
availability: availability.cloned(),
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
|
||||||
Some(("status", subcommand_args)) => {
|
Some(("status", subcommand_args)) => {
|
||||||
match get_pageserver(env, subcommand_args)?.check_status().await {
|
match get_pageserver(env, subcommand_args)?.check_status().await {
|
||||||
Ok(_) => println!("Page server is up and running"),
|
Ok(_) => println!("Page server is up and running"),
|
||||||
@@ -1279,7 +1330,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
|||||||
match ComputeControlPlane::load(env.clone()) {
|
match ComputeControlPlane::load(env.clone()) {
|
||||||
Ok(cplane) => {
|
Ok(cplane) => {
|
||||||
for (_k, node) in cplane.endpoints {
|
for (_k, node) in cplane.endpoints {
|
||||||
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
|
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
|
||||||
eprintln!("postgres stop failed: {e:#}");
|
eprintln!("postgres stop failed: {e:#}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1465,7 +1516,6 @@ fn cli() -> Command {
|
|||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("timeline")
|
Command::new("timeline")
|
||||||
.about("Manage timelines")
|
.about("Manage timelines")
|
||||||
.arg_required_else_help(true)
|
|
||||||
.subcommand(Command::new("list")
|
.subcommand(Command::new("list")
|
||||||
.about("List all timelines, available to this pageserver")
|
.about("List all timelines, available to this pageserver")
|
||||||
.arg(tenant_id_arg.clone()))
|
.arg(tenant_id_arg.clone()))
|
||||||
@@ -1528,8 +1578,19 @@ fn cli() -> Command {
|
|||||||
.subcommand(Command::new("config")
|
.subcommand(Command::new("config")
|
||||||
.arg(tenant_id_arg.clone())
|
.arg(tenant_id_arg.clone())
|
||||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
||||||
.subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
|
.subcommand(Command::new("migrate")
|
||||||
.about("Import a tenant that is present in remote storage, and create branches for its timelines"))
|
.about("Migrate a tenant from one pageserver to another")
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(pageserver_id_arg.clone()))
|
||||||
|
.subcommand(Command::new("status")
|
||||||
|
.about("Human readable summary of the tenant's shards and attachment locations")
|
||||||
|
.arg(tenant_id_arg.clone()))
|
||||||
|
.subcommand(Command::new("shard-split")
|
||||||
|
.about("Increase the number of shards in the tenant")
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
||||||
|
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
|
||||||
|
)
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("pageserver")
|
Command::new("pageserver")
|
||||||
@@ -1549,6 +1610,12 @@ fn cli() -> Command {
|
|||||||
.about("Restart local pageserver")
|
.about("Restart local pageserver")
|
||||||
.arg(pageserver_config_args.clone())
|
.arg(pageserver_config_args.clone())
|
||||||
)
|
)
|
||||||
|
.subcommand(Command::new("set-state")
|
||||||
|
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
|
||||||
|
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
|
||||||
|
.about("Set scheduling or availability state of pageserver node")
|
||||||
|
.arg(pageserver_config_args.clone())
|
||||||
|
)
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("storage_controller")
|
Command::new("storage_controller")
|
||||||
|
|||||||
@@ -12,7 +12,7 @@
|
|||||||
//!
|
//!
|
||||||
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
|
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
|
||||||
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
|
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
|
||||||
//! the basebackup from the pageserver to initialize the data directory, and
|
//! the basebackup from the pageserver to initialize the the data directory, and
|
||||||
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
|
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
|
||||||
//! until it exits.
|
//! until it exits.
|
||||||
//!
|
//!
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ impl NeonBroker {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||||
#[serde(default, deny_unknown_fields)]
|
#[serde(default)]
|
||||||
pub struct PageServerConf {
|
pub struct PageServerConf {
|
||||||
// node id
|
// node id
|
||||||
pub id: NodeId,
|
pub id: NodeId,
|
||||||
@@ -126,9 +126,6 @@ pub struct PageServerConf {
|
|||||||
// auth type used for the PG and HTTP ports
|
// auth type used for the PG and HTTP ports
|
||||||
pub pg_auth_type: AuthType,
|
pub pg_auth_type: AuthType,
|
||||||
pub http_auth_type: AuthType,
|
pub http_auth_type: AuthType,
|
||||||
|
|
||||||
pub(crate) virtual_file_io_engine: Option<String>,
|
|
||||||
pub(crate) get_vectored_impl: Option<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for PageServerConf {
|
impl Default for PageServerConf {
|
||||||
@@ -139,8 +136,6 @@ impl Default for PageServerConf {
|
|||||||
listen_http_addr: String::new(),
|
listen_http_addr: String::new(),
|
||||||
pg_auth_type: AuthType::Trust,
|
pg_auth_type: AuthType::Trust,
|
||||||
http_auth_type: AuthType::Trust,
|
http_auth_type: AuthType::Trust,
|
||||||
virtual_file_io_engine: None,
|
|
||||||
get_vectored_impl: None,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -156,7 +151,6 @@ pub struct SafekeeperConf {
|
|||||||
pub remote_storage: Option<String>,
|
pub remote_storage: Option<String>,
|
||||||
pub backup_threads: Option<u32>,
|
pub backup_threads: Option<u32>,
|
||||||
pub auth_enabled: bool,
|
pub auth_enabled: bool,
|
||||||
pub listen_addr: Option<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for SafekeeperConf {
|
impl Default for SafekeeperConf {
|
||||||
@@ -170,7 +164,6 @@ impl Default for SafekeeperConf {
|
|||||||
remote_storage: None,
|
remote_storage: None,
|
||||||
backup_threads: None,
|
backup_threads: None,
|
||||||
auth_enabled: false,
|
auth_enabled: false,
|
||||||
listen_addr: None,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -78,39 +78,18 @@ impl PageServerNode {
|
|||||||
///
|
///
|
||||||
/// These all end up on the command line of the `pageserver` binary.
|
/// These all end up on the command line of the `pageserver` binary.
|
||||||
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
|
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
|
||||||
|
let id = format!("id={}", self.conf.id);
|
||||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
||||||
let pg_distrib_dir_param = format!(
|
let pg_distrib_dir_param = format!(
|
||||||
"pg_distrib_dir='{}'",
|
"pg_distrib_dir='{}'",
|
||||||
self.env.pg_distrib_dir_raw().display()
|
self.env.pg_distrib_dir_raw().display()
|
||||||
);
|
);
|
||||||
|
|
||||||
let PageServerConf {
|
let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
|
||||||
id,
|
let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
|
||||||
listen_pg_addr,
|
|
||||||
listen_http_addr,
|
|
||||||
pg_auth_type,
|
|
||||||
http_auth_type,
|
|
||||||
virtual_file_io_engine,
|
|
||||||
get_vectored_impl,
|
|
||||||
} = &self.conf;
|
|
||||||
|
|
||||||
let id = format!("id={}", id);
|
let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
|
||||||
|
let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
|
||||||
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
|
|
||||||
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
|
|
||||||
|
|
||||||
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
|
|
||||||
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
|
|
||||||
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
|
|
||||||
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
|
|
||||||
format!("get_vectored_impl='{get_vectored_impl}'")
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
|
|
||||||
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
||||||
|
|
||||||
@@ -122,8 +101,6 @@ impl PageServerNode {
|
|||||||
listen_http_addr_param,
|
listen_http_addr_param,
|
||||||
listen_pg_addr_param,
|
listen_pg_addr_param,
|
||||||
broker_endpoint_param,
|
broker_endpoint_param,
|
||||||
virtual_file_io_engine,
|
|
||||||
get_vectored_impl,
|
|
||||||
];
|
];
|
||||||
|
|
||||||
if let Some(control_plane_api) = &self.env.control_plane_api {
|
if let Some(control_plane_api) = &self.env.control_plane_api {
|
||||||
@@ -134,7 +111,7 @@ impl PageServerNode {
|
|||||||
|
|
||||||
// Storage controller uses the same auth as pageserver: if JWT is enabled
|
// Storage controller uses the same auth as pageserver: if JWT is enabled
|
||||||
// for us, we will also need it to talk to them.
|
// for us, we will also need it to talk to them.
|
||||||
if matches!(http_auth_type, AuthType::NeonJWT) {
|
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
|
||||||
let jwt_token = self
|
let jwt_token = self
|
||||||
.env
|
.env
|
||||||
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
|
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
|
||||||
@@ -152,7 +129,8 @@ impl PageServerNode {
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
|
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
|
||||||
|
{
|
||||||
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
||||||
// are one level below that, so refer to keys with ../
|
// are one level below that, so refer to keys with ../
|
||||||
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
||||||
@@ -389,10 +367,6 @@ impl PageServerNode {
|
|||||||
.remove("image_creation_threshold")
|
.remove("image_creation_threshold")
|
||||||
.map(|x| x.parse::<usize>())
|
.map(|x| x.parse::<usize>())
|
||||||
.transpose()?,
|
.transpose()?,
|
||||||
image_layer_creation_check_threshold: settings
|
|
||||||
.remove("image_layer_creation_check_threshold")
|
|
||||||
.map(|x| x.parse::<u8>())
|
|
||||||
.transpose()?,
|
|
||||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||||
walreceiver_connect_timeout: settings
|
walreceiver_connect_timeout: settings
|
||||||
.remove("walreceiver_connect_timeout")
|
.remove("walreceiver_connect_timeout")
|
||||||
@@ -410,6 +384,11 @@ impl PageServerNode {
|
|||||||
.map(|x| x.parse::<bool>())
|
.map(|x| x.parse::<bool>())
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||||
|
image_layer_compression: settings
|
||||||
|
.remove("image_layer_compression")
|
||||||
|
.map(serde_json::from_str)
|
||||||
|
.transpose()
|
||||||
|
.context("Failed to parse 'image_layer_compression' json")?,
|
||||||
eviction_policy: settings
|
eviction_policy: settings
|
||||||
.remove("eviction_policy")
|
.remove("eviction_policy")
|
||||||
.map(serde_json::from_str)
|
.map(serde_json::from_str)
|
||||||
@@ -505,12 +484,6 @@ impl PageServerNode {
|
|||||||
.map(|x| x.parse::<usize>())
|
.map(|x| x.parse::<usize>())
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
||||||
image_layer_creation_check_threshold: settings
|
|
||||||
.remove("image_layer_creation_check_threshold")
|
|
||||||
.map(|x| x.parse::<u8>())
|
|
||||||
.transpose()
|
|
||||||
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
|
|
||||||
|
|
||||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||||
walreceiver_connect_timeout: settings
|
walreceiver_connect_timeout: settings
|
||||||
.remove("walreceiver_connect_timeout")
|
.remove("walreceiver_connect_timeout")
|
||||||
@@ -528,6 +501,11 @@ impl PageServerNode {
|
|||||||
.map(|x| x.parse::<bool>())
|
.map(|x| x.parse::<bool>())
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||||
|
image_layer_compression: settings
|
||||||
|
.remove("image_layer_compression")
|
||||||
|
.map(serde_json::from_str)
|
||||||
|
.transpose()
|
||||||
|
.context("Failed to parse 'image_layer_compression' json")?,
|
||||||
eviction_policy: settings
|
eviction_policy: settings
|
||||||
.remove("eviction_policy")
|
.remove("eviction_policy")
|
||||||
.map(serde_json::from_str)
|
.map(serde_json::from_str)
|
||||||
@@ -586,6 +564,13 @@ impl PageServerNode {
|
|||||||
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
|
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
|
||||||
|
Ok(self
|
||||||
|
.http_client
|
||||||
|
.tenant_secondary_download(*tenant_id)
|
||||||
|
.await?)
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn timeline_create(
|
pub async fn timeline_create(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
|
|||||||
@@ -70,31 +70,24 @@ pub struct SafekeeperNode {
|
|||||||
pub pg_connection_config: PgConnectionConfig,
|
pub pg_connection_config: PgConnectionConfig,
|
||||||
pub env: LocalEnv,
|
pub env: LocalEnv,
|
||||||
pub http_client: reqwest::Client,
|
pub http_client: reqwest::Client,
|
||||||
pub listen_addr: String,
|
|
||||||
pub http_base_url: String,
|
pub http_base_url: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SafekeeperNode {
|
impl SafekeeperNode {
|
||||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||||
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
|
|
||||||
listen_addr.clone()
|
|
||||||
} else {
|
|
||||||
"127.0.0.1".to_string()
|
|
||||||
};
|
|
||||||
SafekeeperNode {
|
SafekeeperNode {
|
||||||
id: conf.id,
|
id: conf.id,
|
||||||
conf: conf.clone(),
|
conf: conf.clone(),
|
||||||
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
|
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
|
||||||
env: env.clone(),
|
env: env.clone(),
|
||||||
http_client: reqwest::Client::new(),
|
http_client: reqwest::Client::new(),
|
||||||
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
|
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
||||||
listen_addr,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Construct libpq connection string for connecting to this safekeeper.
|
/// Construct libpq connection string for connecting to this safekeeper.
|
||||||
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
|
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
|
||||||
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
|
PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||||
@@ -118,8 +111,8 @@ impl SafekeeperNode {
|
|||||||
);
|
);
|
||||||
io::stdout().flush().unwrap();
|
io::stdout().flush().unwrap();
|
||||||
|
|
||||||
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
|
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
|
||||||
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
|
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
|
||||||
let id = self.id;
|
let id = self.id;
|
||||||
let datadir = self.datadir_path();
|
let datadir = self.datadir_path();
|
||||||
|
|
||||||
@@ -146,7 +139,7 @@ impl SafekeeperNode {
|
|||||||
availability_zone,
|
availability_zone,
|
||||||
];
|
];
|
||||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||||
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
|
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
|
||||||
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
||||||
}
|
}
|
||||||
if !self.conf.sync {
|
if !self.conf.sync {
|
||||||
|
|||||||
@@ -38,9 +38,6 @@ const COMMAND: &str = "storage_controller";
|
|||||||
|
|
||||||
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||||
|
|
||||||
// Use a shorter pageserver unavailability interval than the default to speed up tests.
|
|
||||||
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct AttachHookRequest {
|
pub struct AttachHookRequest {
|
||||||
pub tenant_shard_id: TenantShardId,
|
pub tenant_shard_id: TenantShardId,
|
||||||
@@ -272,18 +269,13 @@ impl StorageController {
|
|||||||
// Run migrations on every startup, in case something changed.
|
// Run migrations on every startup, in case something changed.
|
||||||
let database_url = self.setup_database().await?;
|
let database_url = self.setup_database().await?;
|
||||||
|
|
||||||
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
|
|
||||||
|
|
||||||
let mut args = vec![
|
let mut args = vec![
|
||||||
"-l",
|
"-l",
|
||||||
&self.listen,
|
&self.listen,
|
||||||
"-p",
|
"-p",
|
||||||
self.path.as_ref(),
|
self.path.as_ref(),
|
||||||
"--dev",
|
|
||||||
"--database-url",
|
"--database-url",
|
||||||
&database_url,
|
&database_url,
|
||||||
"--max-unavailable-interval",
|
|
||||||
&max_unavailable.to_string(),
|
|
||||||
]
|
]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|s| s.to_string())
|
.map(|s| s.to_string())
|
||||||
@@ -472,21 +464,11 @@ impl StorageController {
|
|||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip(self))]
|
|
||||||
pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
|
|
||||||
self.dispatch::<(), TenantCreateResponse>(
|
|
||||||
Method::POST,
|
|
||||||
format!("debug/v1/tenant/{tenant_id}/import"),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
}
|
|
||||||
|
|
||||||
#[instrument(skip(self))]
|
#[instrument(skip(self))]
|
||||||
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
||||||
self.dispatch::<(), _>(
|
self.dispatch::<(), _>(
|
||||||
Method::GET,
|
Method::GET,
|
||||||
format!("debug/v1/tenant/{tenant_id}/locate"),
|
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
|
|||||||
@@ -1,23 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "storcon_cli"
|
|
||||||
version = "0.1.0"
|
|
||||||
edition.workspace = true
|
|
||||||
license.workspace = true
|
|
||||||
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
anyhow.workspace = true
|
|
||||||
clap.workspace = true
|
|
||||||
comfy-table.workspace = true
|
|
||||||
hyper.workspace = true
|
|
||||||
pageserver_api.workspace = true
|
|
||||||
pageserver_client.workspace = true
|
|
||||||
reqwest.workspace = true
|
|
||||||
serde.workspace = true
|
|
||||||
serde_json = { workspace = true, features = ["raw_value"] }
|
|
||||||
thiserror.workspace = true
|
|
||||||
tokio.workspace = true
|
|
||||||
tracing.workspace = true
|
|
||||||
utils.workspace = true
|
|
||||||
workspace_hack.workspace = true
|
|
||||||
|
|
||||||
@@ -1,681 +0,0 @@
|
|||||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
|
||||||
|
|
||||||
use clap::{Parser, Subcommand};
|
|
||||||
use hyper::{Method, StatusCode};
|
|
||||||
use pageserver_api::{
|
|
||||||
controller_api::{
|
|
||||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
|
||||||
TenantDescribeResponse, TenantPolicyRequest,
|
|
||||||
},
|
|
||||||
models::{
|
|
||||||
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
|
|
||||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
|
||||||
},
|
|
||||||
shard::{ShardStripeSize, TenantShardId},
|
|
||||||
};
|
|
||||||
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
|
||||||
use reqwest::Url;
|
|
||||||
use serde::{de::DeserializeOwned, Serialize};
|
|
||||||
use utils::id::{NodeId, TenantId};
|
|
||||||
|
|
||||||
use pageserver_api::controller_api::{
|
|
||||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
|
||||||
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
|
||||||
};
|
|
||||||
|
|
||||||
#[derive(Subcommand, Debug)]
|
|
||||||
enum Command {
|
|
||||||
/// Register a pageserver with the storage controller. This shouldn't usually be necessary,
|
|
||||||
/// since pageservers auto-register when they start up
|
|
||||||
NodeRegister {
|
|
||||||
#[arg(long)]
|
|
||||||
node_id: NodeId,
|
|
||||||
|
|
||||||
#[arg(long)]
|
|
||||||
listen_pg_addr: String,
|
|
||||||
#[arg(long)]
|
|
||||||
listen_pg_port: u16,
|
|
||||||
|
|
||||||
#[arg(long)]
|
|
||||||
listen_http_addr: String,
|
|
||||||
#[arg(long)]
|
|
||||||
listen_http_port: u16,
|
|
||||||
},
|
|
||||||
|
|
||||||
/// Modify a node's configuration in the storage controller
|
|
||||||
NodeConfigure {
|
|
||||||
#[arg(long)]
|
|
||||||
node_id: NodeId,
|
|
||||||
|
|
||||||
/// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
|
|
||||||
/// manually mark a node offline
|
|
||||||
#[arg(long)]
|
|
||||||
availability: Option<NodeAvailabilityArg>,
|
|
||||||
/// Scheduling policy controls whether tenant shards may be scheduled onto this node.
|
|
||||||
#[arg(long)]
|
|
||||||
scheduling: Option<NodeSchedulingPolicy>,
|
|
||||||
},
|
|
||||||
/// Modify a tenant's policies in the storage controller
|
|
||||||
TenantPolicy {
|
|
||||||
#[arg(long)]
|
|
||||||
tenant_id: TenantId,
|
|
||||||
/// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
|
|
||||||
/// or is in the normal attached state with N secondary locations (`attached:N`)
|
|
||||||
#[arg(long)]
|
|
||||||
placement: Option<PlacementPolicyArg>,
|
|
||||||
/// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
|
|
||||||
/// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
|
|
||||||
/// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
|
|
||||||
/// unavailable, and are only for use in emergencies.
|
|
||||||
#[arg(long)]
|
|
||||||
scheduling: Option<ShardSchedulingPolicyArg>,
|
|
||||||
},
|
|
||||||
/// List nodes known to the storage controller
|
|
||||||
Nodes {},
|
|
||||||
/// List tenants known to the storage controller
|
|
||||||
Tenants {},
|
|
||||||
/// Create a new tenant in the storage controller, and by extension on pageservers.
|
|
||||||
TenantCreate {
|
|
||||||
#[arg(long)]
|
|
||||||
tenant_id: TenantId,
|
|
||||||
},
|
|
||||||
/// Delete a tenant in the storage controller, and by extension on pageservers.
|
|
||||||
TenantDelete {
|
|
||||||
#[arg(long)]
|
|
||||||
tenant_id: TenantId,
|
|
||||||
},
|
|
||||||
/// Split an existing tenant into a higher number of shards than its current shard count.
|
|
||||||
TenantShardSplit {
|
|
||||||
#[arg(long)]
|
|
||||||
tenant_id: TenantId,
|
|
||||||
#[arg(long)]
|
|
||||||
shard_count: u8,
|
|
||||||
/// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
|
|
||||||
#[arg(long)]
|
|
||||||
stripe_size: Option<u32>,
|
|
||||||
},
|
|
||||||
/// Migrate the attached location for a tenant shard to a specific pageserver.
|
|
||||||
TenantShardMigrate {
|
|
||||||
#[arg(long)]
|
|
||||||
tenant_shard_id: TenantShardId,
|
|
||||||
#[arg(long)]
|
|
||||||
node: NodeId,
|
|
||||||
},
|
|
||||||
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
|
|
||||||
/// that is passed through to pageservers, and does not affect storage controller behavior.
|
|
||||||
TenantConfig {
|
|
||||||
#[arg(long)]
|
|
||||||
tenant_id: TenantId,
|
|
||||||
#[arg(long)]
|
|
||||||
config: String,
|
|
||||||
},
|
|
||||||
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
|
|
||||||
/// alternative to the storage controller's scheduling optimization behavior.
|
|
||||||
TenantScatter {
|
|
||||||
#[arg(long)]
|
|
||||||
tenant_id: TenantId,
|
|
||||||
},
|
|
||||||
/// Print details about a particular tenant, including all its shards' states.
|
|
||||||
TenantDescribe {
|
|
||||||
#[arg(long)]
|
|
||||||
tenant_id: TenantId,
|
|
||||||
},
|
|
||||||
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
|
|
||||||
/// mode so that it can warm up content on a pageserver.
|
|
||||||
TenantWarmup {
|
|
||||||
#[arg(long)]
|
|
||||||
tenant_id: TenantId,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Parser)]
|
|
||||||
#[command(
|
|
||||||
author,
|
|
||||||
version,
|
|
||||||
about,
|
|
||||||
long_about = "CLI for Storage Controller Support/Debug"
|
|
||||||
)]
|
|
||||||
#[command(arg_required_else_help(true))]
|
|
||||||
struct Cli {
|
|
||||||
#[arg(long)]
|
|
||||||
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
|
|
||||||
api: Url,
|
|
||||||
|
|
||||||
#[arg(long)]
|
|
||||||
/// JWT token for authenticating with storage controller. Depending on the API used, this
|
|
||||||
/// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
|
|
||||||
/// a token with both scopes to use with this tool.
|
|
||||||
jwt: Option<String>,
|
|
||||||
|
|
||||||
#[command(subcommand)]
|
|
||||||
command: Command,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
struct PlacementPolicyArg(PlacementPolicy);
|
|
||||||
|
|
||||||
impl FromStr for PlacementPolicyArg {
|
|
||||||
type Err = anyhow::Error;
|
|
||||||
|
|
||||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
|
||||||
match s {
|
|
||||||
"detached" => Ok(Self(PlacementPolicy::Detached)),
|
|
||||||
"secondary" => Ok(Self(PlacementPolicy::Secondary)),
|
|
||||||
_ if s.starts_with("attached:") => {
|
|
||||||
let mut splitter = s.split(':');
|
|
||||||
let _prefix = splitter.next().unwrap();
|
|
||||||
match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
|
|
||||||
Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
|
|
||||||
None => Err(anyhow::anyhow!(
|
|
||||||
"Invalid format '{s}', a valid example is 'attached:1'"
|
|
||||||
)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => Err(anyhow::anyhow!(
|
|
||||||
"Unknown placement policy '{s}', try detached,secondary,attached:<n>"
|
|
||||||
)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
|
|
||||||
|
|
||||||
impl FromStr for ShardSchedulingPolicyArg {
|
|
||||||
type Err = anyhow::Error;
|
|
||||||
|
|
||||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
|
||||||
match s {
|
|
||||||
"active" => Ok(Self(ShardSchedulingPolicy::Active)),
|
|
||||||
"essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
|
|
||||||
"pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
|
|
||||||
"stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
|
|
||||||
_ => Err(anyhow::anyhow!(
|
|
||||||
"Unknown scheduling policy '{s}', try active,essential,pause,stop"
|
|
||||||
)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
struct NodeAvailabilityArg(NodeAvailabilityWrapper);
|
|
||||||
|
|
||||||
impl FromStr for NodeAvailabilityArg {
|
|
||||||
type Err = anyhow::Error;
|
|
||||||
|
|
||||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
|
||||||
match s {
|
|
||||||
"active" => Ok(Self(NodeAvailabilityWrapper::Active)),
|
|
||||||
"offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
|
|
||||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Client {
|
|
||||||
base_url: Url,
|
|
||||||
jwt_token: Option<String>,
|
|
||||||
client: reqwest::Client,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Client {
|
|
||||||
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
|
|
||||||
Self {
|
|
||||||
base_url,
|
|
||||||
jwt_token,
|
|
||||||
client: reqwest::ClientBuilder::new()
|
|
||||||
.build()
|
|
||||||
.expect("Failed to construct http client"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Simple HTTP request wrapper for calling into storage controller
|
|
||||||
async fn dispatch<RQ, RS>(
|
|
||||||
&self,
|
|
||||||
method: hyper::Method,
|
|
||||||
path: String,
|
|
||||||
body: Option<RQ>,
|
|
||||||
) -> mgmt_api::Result<RS>
|
|
||||||
where
|
|
||||||
RQ: Serialize + Sized,
|
|
||||||
RS: DeserializeOwned + Sized,
|
|
||||||
{
|
|
||||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
|
||||||
// for general purpose API access.
|
|
||||||
let url = Url::from_str(&format!(
|
|
||||||
"http://{}:{}/{path}",
|
|
||||||
self.base_url.host_str().unwrap(),
|
|
||||||
self.base_url.port().unwrap()
|
|
||||||
))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let mut builder = self.client.request(method, url);
|
|
||||||
if let Some(body) = body {
|
|
||||||
builder = builder.json(&body)
|
|
||||||
}
|
|
||||||
if let Some(jwt_token) = &self.jwt_token {
|
|
||||||
builder = builder.header(
|
|
||||||
reqwest::header::AUTHORIZATION,
|
|
||||||
format!("Bearer {jwt_token}"),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
|
|
||||||
let response = response.error_from_body().await?;
|
|
||||||
|
|
||||||
response
|
|
||||||
.json()
|
|
||||||
.await
|
|
||||||
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
|
||||||
async fn main() -> anyhow::Result<()> {
|
|
||||||
let cli = Cli::parse();
|
|
||||||
|
|
||||||
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
|
|
||||||
|
|
||||||
let mut trimmed = cli.api.to_string();
|
|
||||||
trimmed.pop();
|
|
||||||
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
|
|
||||||
|
|
||||||
match cli.command {
|
|
||||||
Command::NodeRegister {
|
|
||||||
node_id,
|
|
||||||
listen_pg_addr,
|
|
||||||
listen_pg_port,
|
|
||||||
listen_http_addr,
|
|
||||||
listen_http_port,
|
|
||||||
} => {
|
|
||||||
storcon_client
|
|
||||||
.dispatch::<_, ()>(
|
|
||||||
Method::POST,
|
|
||||||
"control/v1/node".to_string(),
|
|
||||||
Some(NodeRegisterRequest {
|
|
||||||
node_id,
|
|
||||||
listen_pg_addr,
|
|
||||||
listen_pg_port,
|
|
||||||
listen_http_addr,
|
|
||||||
listen_http_port,
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
Command::TenantCreate { tenant_id } => {
|
|
||||||
vps_client
|
|
||||||
.tenant_create(&TenantCreateRequest {
|
|
||||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
|
||||||
generation: None,
|
|
||||||
shard_parameters: ShardParameters::default(),
|
|
||||||
placement_policy: Some(PlacementPolicy::Attached(1)),
|
|
||||||
config: TenantConfig::default(),
|
|
||||||
})
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
Command::TenantDelete { tenant_id } => {
|
|
||||||
let status = vps_client
|
|
||||||
.tenant_delete(TenantShardId::unsharded(tenant_id))
|
|
||||||
.await?;
|
|
||||||
tracing::info!("Delete status: {}", status);
|
|
||||||
}
|
|
||||||
Command::Nodes {} => {
|
|
||||||
let resp = storcon_client
|
|
||||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
|
||||||
Method::GET,
|
|
||||||
"control/v1/node".to_string(),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
let mut table = comfy_table::Table::new();
|
|
||||||
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
|
||||||
for node in resp {
|
|
||||||
table.add_row([
|
|
||||||
format!("{}", node.id),
|
|
||||||
node.listen_http_addr,
|
|
||||||
format!("{:?}", node.scheduling),
|
|
||||||
format!("{:?}", node.availability),
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
println!("{table}");
|
|
||||||
}
|
|
||||||
Command::NodeConfigure {
|
|
||||||
node_id,
|
|
||||||
availability,
|
|
||||||
scheduling,
|
|
||||||
} => {
|
|
||||||
let req = NodeConfigureRequest {
|
|
||||||
node_id,
|
|
||||||
availability: availability.map(|a| a.0),
|
|
||||||
scheduling,
|
|
||||||
};
|
|
||||||
storcon_client
|
|
||||||
.dispatch::<_, ()>(
|
|
||||||
Method::PUT,
|
|
||||||
format!("control/v1/node/{node_id}/config"),
|
|
||||||
Some(req),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
Command::Tenants {} => {
|
|
||||||
let resp = storcon_client
|
|
||||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
|
||||||
Method::GET,
|
|
||||||
"control/v1/tenant".to_string(),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
let mut table = comfy_table::Table::new();
|
|
||||||
table.set_header([
|
|
||||||
"TenantId",
|
|
||||||
"ShardCount",
|
|
||||||
"StripeSize",
|
|
||||||
"Placement",
|
|
||||||
"Scheduling",
|
|
||||||
]);
|
|
||||||
for tenant in resp {
|
|
||||||
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
|
||||||
table.add_row([
|
|
||||||
format!("{}", tenant.tenant_id),
|
|
||||||
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
|
||||||
format!("{:?}", tenant.stripe_size),
|
|
||||||
format!("{:?}", tenant.policy),
|
|
||||||
format!("{:?}", shard_zero.scheduling_policy),
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
|
|
||||||
println!("{table}");
|
|
||||||
}
|
|
||||||
Command::TenantPolicy {
|
|
||||||
tenant_id,
|
|
||||||
placement,
|
|
||||||
scheduling,
|
|
||||||
} => {
|
|
||||||
let req = TenantPolicyRequest {
|
|
||||||
scheduling: scheduling.map(|s| s.0),
|
|
||||||
placement: placement.map(|p| p.0),
|
|
||||||
};
|
|
||||||
storcon_client
|
|
||||||
.dispatch::<_, ()>(
|
|
||||||
Method::PUT,
|
|
||||||
format!("control/v1/tenant/{tenant_id}/policy"),
|
|
||||||
Some(req),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
Command::TenantShardSplit {
|
|
||||||
tenant_id,
|
|
||||||
shard_count,
|
|
||||||
stripe_size,
|
|
||||||
} => {
|
|
||||||
let req = TenantShardSplitRequest {
|
|
||||||
new_shard_count: shard_count,
|
|
||||||
new_stripe_size: stripe_size.map(ShardStripeSize),
|
|
||||||
};
|
|
||||||
|
|
||||||
let response = storcon_client
|
|
||||||
.dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
|
|
||||||
Method::PUT,
|
|
||||||
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
|
||||||
Some(req),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
println!(
|
|
||||||
"Split tenant {} into {} shards: {}",
|
|
||||||
tenant_id,
|
|
||||||
shard_count,
|
|
||||||
response
|
|
||||||
.new_shards
|
|
||||||
.iter()
|
|
||||||
.map(|s| format!("{:?}", s))
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.join(",")
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Command::TenantShardMigrate {
|
|
||||||
tenant_shard_id,
|
|
||||||
node,
|
|
||||||
} => {
|
|
||||||
let req = TenantShardMigrateRequest {
|
|
||||||
tenant_shard_id,
|
|
||||||
node_id: node,
|
|
||||||
};
|
|
||||||
|
|
||||||
storcon_client
|
|
||||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
|
||||||
Method::PUT,
|
|
||||||
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
|
||||||
Some(req),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
Command::TenantConfig { tenant_id, config } => {
|
|
||||||
let tenant_conf = serde_json::from_str(&config)?;
|
|
||||||
|
|
||||||
vps_client
|
|
||||||
.tenant_config(&TenantConfigRequest {
|
|
||||||
tenant_id,
|
|
||||||
config: tenant_conf,
|
|
||||||
})
|
|
||||||
.await?;
|
|
||||||
}
|
|
||||||
Command::TenantScatter { tenant_id } => {
|
|
||||||
// Find the shards
|
|
||||||
let locate_response = storcon_client
|
|
||||||
.dispatch::<(), TenantLocateResponse>(
|
|
||||||
Method::GET,
|
|
||||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
let shards = locate_response.shards;
|
|
||||||
|
|
||||||
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
|
|
||||||
let shard_count = shards.len();
|
|
||||||
for s in shards {
|
|
||||||
let entry = node_to_shards.entry(s.node_id).or_default();
|
|
||||||
entry.push(s.shard_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Load list of available nodes
|
|
||||||
let nodes_resp = storcon_client
|
|
||||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
|
||||||
Method::GET,
|
|
||||||
"control/v1/node".to_string(),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
for node in nodes_resp {
|
|
||||||
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
|
|
||||||
node_to_shards.entry(node.id).or_default();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let max_shard_per_node = shard_count / node_to_shards.len();
|
|
||||||
|
|
||||||
loop {
|
|
||||||
let mut migrate_shard = None;
|
|
||||||
for shards in node_to_shards.values_mut() {
|
|
||||||
if shards.len() > max_shard_per_node {
|
|
||||||
// Pick the emptiest
|
|
||||||
migrate_shard = Some(shards.pop().unwrap());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let Some(migrate_shard) = migrate_shard else {
|
|
||||||
break;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Pick the emptiest node to migrate to
|
|
||||||
let mut destinations = node_to_shards
|
|
||||||
.iter()
|
|
||||||
.map(|(k, v)| (k, v.len()))
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
destinations.sort_by_key(|i| i.1);
|
|
||||||
let (destination_node, destination_count) = *destinations.first().unwrap();
|
|
||||||
if destination_count + 1 > max_shard_per_node {
|
|
||||||
// Even the emptiest destination doesn't have space: we're done
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
let destination_node = *destination_node;
|
|
||||||
|
|
||||||
node_to_shards
|
|
||||||
.get_mut(&destination_node)
|
|
||||||
.unwrap()
|
|
||||||
.push(migrate_shard);
|
|
||||||
|
|
||||||
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
|
|
||||||
|
|
||||||
storcon_client
|
|
||||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
|
||||||
Method::PUT,
|
|
||||||
format!("control/v1/tenant/{migrate_shard}/migrate"),
|
|
||||||
Some(TenantShardMigrateRequest {
|
|
||||||
tenant_shard_id: migrate_shard,
|
|
||||||
node_id: destination_node,
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Spread the shards across the nodes
|
|
||||||
}
|
|
||||||
Command::TenantDescribe { tenant_id } => {
|
|
||||||
let describe_response = storcon_client
|
|
||||||
.dispatch::<(), TenantDescribeResponse>(
|
|
||||||
Method::GET,
|
|
||||||
format!("control/v1/tenant/{tenant_id}"),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
let shards = describe_response.shards;
|
|
||||||
let mut table = comfy_table::Table::new();
|
|
||||||
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
|
|
||||||
for shard in shards {
|
|
||||||
let secondary = shard
|
|
||||||
.node_secondary
|
|
||||||
.iter()
|
|
||||||
.map(|n| format!("{}", n))
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.join(",");
|
|
||||||
|
|
||||||
let mut status_parts = Vec::new();
|
|
||||||
if shard.is_reconciling {
|
|
||||||
status_parts.push("reconciling");
|
|
||||||
}
|
|
||||||
|
|
||||||
if shard.is_pending_compute_notification {
|
|
||||||
status_parts.push("pending_compute");
|
|
||||||
}
|
|
||||||
|
|
||||||
if shard.is_splitting {
|
|
||||||
status_parts.push("splitting");
|
|
||||||
}
|
|
||||||
let status = status_parts.join(",");
|
|
||||||
|
|
||||||
table.add_row([
|
|
||||||
format!("{}", shard.tenant_shard_id),
|
|
||||||
shard
|
|
||||||
.node_attached
|
|
||||||
.map(|n| format!("{}", n))
|
|
||||||
.unwrap_or(String::new()),
|
|
||||||
secondary,
|
|
||||||
shard.last_error,
|
|
||||||
status,
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
println!("{table}");
|
|
||||||
}
|
|
||||||
Command::TenantWarmup { tenant_id } => {
|
|
||||||
let describe_response = storcon_client
|
|
||||||
.dispatch::<(), TenantDescribeResponse>(
|
|
||||||
Method::GET,
|
|
||||||
format!("control/v1/tenant/{tenant_id}"),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
match describe_response {
|
|
||||||
Ok(describe) => {
|
|
||||||
if matches!(describe.policy, PlacementPolicy::Secondary) {
|
|
||||||
// Fine: it's already known to controller in secondary mode: calling
|
|
||||||
// again to put it into secondary mode won't cause problems.
|
|
||||||
} else {
|
|
||||||
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
|
|
||||||
// Fine: this tenant isn't known to the storage controller yet.
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
// Unexpected API error
|
|
||||||
return Err(e.into());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
vps_client
|
|
||||||
.location_config(
|
|
||||||
TenantShardId::unsharded(tenant_id),
|
|
||||||
pageserver_api::models::LocationConfig {
|
|
||||||
mode: pageserver_api::models::LocationConfigMode::Secondary,
|
|
||||||
generation: None,
|
|
||||||
secondary_conf: Some(LocationConfigSecondary { warm: true }),
|
|
||||||
shard_number: 0,
|
|
||||||
shard_count: 0,
|
|
||||||
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
|
|
||||||
tenant_conf: TenantConfig::default(),
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
true,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let describe_response = storcon_client
|
|
||||||
.dispatch::<(), TenantDescribeResponse>(
|
|
||||||
Method::GET,
|
|
||||||
format!("control/v1/tenant/{tenant_id}"),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let secondary_ps_id = describe_response
|
|
||||||
.shards
|
|
||||||
.first()
|
|
||||||
.unwrap()
|
|
||||||
.node_secondary
|
|
||||||
.first()
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
|
|
||||||
loop {
|
|
||||||
let (status, progress) = vps_client
|
|
||||||
.tenant_secondary_download(
|
|
||||||
TenantShardId::unsharded(tenant_id),
|
|
||||||
Some(Duration::from_secs(10)),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
println!(
|
|
||||||
"Progress: {}/{} layers, {}/{} bytes",
|
|
||||||
progress.layers_downloaded,
|
|
||||||
progress.layers_total,
|
|
||||||
progress.bytes_downloaded,
|
|
||||||
progress.bytes_total
|
|
||||||
);
|
|
||||||
match status {
|
|
||||||
StatusCode::OK => {
|
|
||||||
println!("Download complete");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
StatusCode::ACCEPTED => {
|
|
||||||
// Loop
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
anyhow::bail!("Unexpected download status: {status}");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
@@ -2,8 +2,8 @@
|
|||||||
# see https://diesel.rs/guides/configuring-diesel-cli
|
# see https://diesel.rs/guides/configuring-diesel-cli
|
||||||
|
|
||||||
[print_schema]
|
[print_schema]
|
||||||
file = "storage_controller/src/schema.rs"
|
file = "control_plane/attachment_service/src/schema.rs"
|
||||||
custom_type_derives = ["diesel::query_builder::QueryId"]
|
custom_type_derives = ["diesel::query_builder::QueryId"]
|
||||||
|
|
||||||
[migrations_directory]
|
[migrations_directory]
|
||||||
dir = "storage_controller/migrations"
|
dir = "control_plane/attachment_service/migrations"
|
||||||
|
|||||||
@@ -1,408 +0,0 @@
|
|||||||
# Sharding Phase 1: Static Key-space Sharding
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
To enable databases with sizes approaching the capacity of a pageserver's disk,
|
|
||||||
it is necessary to break up the storage for the database, or _shard_ it.
|
|
||||||
|
|
||||||
Sharding in general is a complex area. This RFC aims to define an initial
|
|
||||||
capability that will permit creating large-capacity databases using a static configuration
|
|
||||||
defined at time of Tenant creation.
|
|
||||||
|
|
||||||
## Motivation
|
|
||||||
|
|
||||||
Currently, all data for a Tenant, including all its timelines, is stored on a single
|
|
||||||
pageserver. The local storage required may be several times larger than the actual
|
|
||||||
database size, due to LSM write inflation.
|
|
||||||
|
|
||||||
If a database is larger than what one pageserver can hold, then it becomes impossible
|
|
||||||
for the pageserver to hold it in local storage, as it must do to provide service to
|
|
||||||
clients.
|
|
||||||
|
|
||||||
### Prior art
|
|
||||||
|
|
||||||
In Neon:
|
|
||||||
|
|
||||||
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
|
|
||||||
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
|
|
||||||
- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
|
|
||||||
|
|
||||||
Prior art in other distributed systems is too broad to capture here: pretty much
|
|
||||||
any scale out storage system does something like this.
|
|
||||||
|
|
||||||
## Requirements
|
|
||||||
|
|
||||||
- Enable creating a large (for example, 16TiB) database without requiring dedicated
|
|
||||||
pageserver nodes.
|
|
||||||
- Share read/write bandwidth costs for large databases across pageservers, as well
|
|
||||||
as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
|
|
||||||
that disrupt service to other tenants.
|
|
||||||
- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
|
|
||||||
does not write out a single contiguous range of page numbers.
|
|
||||||
|
|
||||||
_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
|
|
||||||
that a user might create on a current-gen enterprise SSD should also work well on
|
|
||||||
Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
|
|
||||||
pageserver backend is not the limiting factor in the database size_.
|
|
||||||
|
|
||||||
## Non Goals
|
|
||||||
|
|
||||||
- Independently distributing timelines within the same tenant. If a tenant has many
|
|
||||||
timelines, then sharding may be a less efficient mechanism for distributing load than
|
|
||||||
sharing out timelines between pageservers.
|
|
||||||
- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
|
|
||||||
based on the idea that separate mechanisms will make sense for each dimension.
|
|
||||||
|
|
||||||
## Impacted Components
|
|
||||||
|
|
||||||
pageserver, control plane, postgres/smgr
|
|
||||||
|
|
||||||
## Terminology
|
|
||||||
|
|
||||||
**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
|
|
||||||
the page number is the key in that store. `Key` is a literal data type in existing code.
|
|
||||||
|
|
||||||
**LSN dimension**: this just means the range of LSNs (history), when talking about the range
|
|
||||||
of keys and LSNs as a two dimensional space.
|
|
||||||
|
|
||||||
## Implementation
|
|
||||||
|
|
||||||
### Key sharding vs. LSN sharding
|
|
||||||
|
|
||||||
When we think of sharding across the two dimensional key/lsn space, this is an
|
|
||||||
opportunity to think about how the two dimensions differ:
|
|
||||||
|
|
||||||
- Sharding the key space distributes the _write_ workload of ingesting data
|
|
||||||
and compacting. This work must be carefully managed so that exactly one
|
|
||||||
node owns a given key.
|
|
||||||
- Sharding the LSN space distributes the _historical read_ workload. This work
|
|
||||||
can be done by anyone without any special coordination, as long as they can
|
|
||||||
see the remote index and layers.
|
|
||||||
|
|
||||||
The key sharding is the harder part, and also the more urgent one, to support larger
|
|
||||||
capacity databases. Because distributing historical LSN read work is a relatively
|
|
||||||
simpler problem that most users don't have, we defer it to future work. It is anticipated
|
|
||||||
that some quite simple P2P offload model will enable distributing work for historical
|
|
||||||
reads: a node which is low on space can call out to a peer to ask it to download and
|
|
||||||
serve reads from a historical layer.
|
|
||||||
|
|
||||||
### Key mapping scheme
|
|
||||||
|
|
||||||
Having decided to focus on key sharding, we must next decide how we will map
|
|
||||||
keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
|
|
||||||
between data locality and avoiding entire large relations mapping to the same shard.
|
|
||||||
|
|
||||||
We will define two spaces:
|
|
||||||
|
|
||||||
- Key space: unsigned integer
|
|
||||||
- Shard space: integer from 0 to N-1, where we have N shards.
|
|
||||||
|
|
||||||
### Key -> Shard mapping
|
|
||||||
|
|
||||||
Keys are currently defined in the pageserver's getpage@lsn interface as follows:
|
|
||||||
|
|
||||||
```
|
|
||||||
pub struct Key {
|
|
||||||
pub field1: u8,
|
|
||||||
pub field2: u32,
|
|
||||||
pub field3: u32,
|
|
||||||
pub field4: u32,
|
|
||||||
pub field5: u8,
|
|
||||||
pub field6: u32,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
|
||||||
Key {
|
|
||||||
field1: 0x00,
|
|
||||||
field2: rel.spcnode,
|
|
||||||
field3: rel.dbnode,
|
|
||||||
field4: rel.relnode,
|
|
||||||
field5: rel.forknum,
|
|
||||||
field6: blknum,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
|
|
||||||
shards. For distribution purposes, we only care about user data keys_
|
|
||||||
|
|
||||||
The properties we want from our Key->Shard mapping are:
|
|
||||||
|
|
||||||
- Locality in `blknum`, such that adjacent `blknum` will usually map to
|
|
||||||
the same stripe and consequently land on the same shard, even though the overall
|
|
||||||
collection of blocks in a relation will be spread over many stripes and therefore
|
|
||||||
many shards.
|
|
||||||
- Avoid the same blknum on different relations landing on the same stripe, so that
|
|
||||||
with many small relations we do not end up aliasing data to the same stripe/shard.
|
|
||||||
- Avoid vulnerability to aliasing in the values of relation identity fields, such that
|
|
||||||
if there are patterns in the value of `relnode`, these do not manifest as patterns
|
|
||||||
in data placement.
|
|
||||||
|
|
||||||
To accomplish this, the blknum is used to select a stripe, and stripes are
|
|
||||||
assigned to shards in a pseudorandom order via a hash. The motivation for
|
|
||||||
pseudo-random distribution (rather than sequential mapping of stripe to shard)
|
|
||||||
is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
|
|
||||||
all relations' stripes to touch pageservers in the same order.
|
|
||||||
|
|
||||||
To map a `Key` to a shard:
|
|
||||||
|
|
||||||
- Hash the `Key` field 4 (relNode).
|
|
||||||
- Divide field 6 (`blknum`) by the stripe size in pages, and combine the
|
|
||||||
hash of this with the hash from the previous step.
|
|
||||||
- The total hash modulo the shard count gives the shard holding this key.
|
|
||||||
|
|
||||||
Why don't we use the other fields in the Key?
|
|
||||||
|
|
||||||
- We ignore `forknum` for key mapping, because it distinguishes different classes of data
|
|
||||||
in the same relation, and we would like to keep the data in a relation together.
|
|
||||||
- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
|
|
||||||
database's blocks differ only by spcNode and dbNode from the original. To enable running
|
|
||||||
this type of creation without cross-pageserver communication, we must ensure that these
|
|
||||||
blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
|
|
||||||
|
|
||||||
### Data placement examples
|
|
||||||
|
|
||||||
For example, consider the extreme large-database cases of postgres data layout in a system with 8 shards
|
|
||||||
and a stripe size of 32k pages:
|
|
||||||
|
|
||||||
- A single large relation: `blknum` division will break the data up into 4096
|
|
||||||
stripes, which will be scattered across the shards.
|
|
||||||
- 4096 relations of 32k pages each: each relation will map to exactly one stripe,
|
|
||||||
and that stripe will be placed according to the hash of key field 4. The
|
|
||||||
data placement will be statistically uniform across shards.
|
|
||||||
|
|
||||||
Data placement will be more uneven on smaller databases:
|
|
||||||
|
|
||||||
- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
|
|
||||||
that both relations land on the same shard and no data lands on the other shard.
|
|
||||||
- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
|
|
||||||
the data of the other four shards.
|
|
||||||
|
|
||||||
These uneven cases for small amounts of data do not matter, as long as the stripe size
|
|
||||||
is an order of magnitude smaller than the amount of data we are comfortable holding
|
|
||||||
in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
|
|
||||||
a tenant has some shards with 256MB size and some shards with 512MB size, even though
|
|
||||||
the standard deviation of shard size within the tenant is very high. Our key mapping
|
|
||||||
scheme provides a statistical guarantee that as the tenant's overall data size increases,
|
|
||||||
uniformity of placement will improve.
|
|
||||||
|
|
||||||
### Important Types
|
|
||||||
|
|
||||||
#### `ShardIdentity`
|
|
||||||
|
|
||||||
Provides the information needed to know whether a particular key belongs
|
|
||||||
to a particular shard:
|
|
||||||
|
|
||||||
- Layout version
|
|
||||||
- Stripe size
|
|
||||||
- Shard count
|
|
||||||
- Shard index
|
|
||||||
|
|
||||||
This structure's size is constant. Note that if we had used a different key
|
|
||||||
mapping scheme such as consistent hashing with explicit hash ranges assigned
|
|
||||||
to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
|
|
||||||
key mapping scheme used here enables a small fixed size ShardIdentity.
|
|
||||||
|
|
||||||
### Pageserver changes
|
|
||||||
|
|
||||||
#### Structural
|
|
||||||
|
|
||||||
Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
|
|
||||||
`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
|
|
||||||
of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
|
|
||||||
covers the whole keyspace.
|
|
||||||
|
|
||||||
When the pageserver writes layers and index_part.json to remote storage, it must
|
|
||||||
include the shard index & count in the name, to avoid collisions (the count is
|
|
||||||
necessary for future-proofing: the count will vary in time). These keys
|
|
||||||
will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
|
|
||||||
exactly the same for TenantShards as it does for Tenants today: each shard will have
|
|
||||||
its own generation number.
|
|
||||||
|
|
||||||
#### Storage Format: Keys
|
|
||||||
|
|
||||||
For tenants with >1 shard, layer files implicitly become sparse: within the key
|
|
||||||
range described in the layer name, the layer file for a shard will only hold the
|
|
||||||
content relevant to stripes assigned to the shard.
|
|
||||||
|
|
||||||
For this reason, the LayerFileName within a tenant is no longer unique: different shards
|
|
||||||
may use the same LayerFileName to refer to different data. We may solve this simply
|
|
||||||
by including the shard number in the keys used for layers.
|
|
||||||
|
|
||||||
The shard number will be included as a prefix (as part of tenant ID), like this:
|
|
||||||
|
|
||||||
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
|
|
||||||
|
|
||||||
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
|
|
||||||
|
|
||||||
Reasons for this particular format:
|
|
||||||
|
|
||||||
- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
|
|
||||||
we construct a layer file name), and enables efficient listing of index_parts within
|
|
||||||
a particular shard-timeline prefix.
|
|
||||||
- Including the shard _count_ as well as shard number means that in future when we implement
|
|
||||||
shard splitting, it will be possible for a parent shard and one of its children to write
|
|
||||||
the same layer file without a name collision. For example, a parent shard 0_1 might split
|
|
||||||
into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
|
|
||||||
that is distinct from what shard 0_1 would have written at the same place.
|
|
||||||
|
|
||||||
In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
|
|
||||||
and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
|
|
||||||
for example a single-shard tenant's prefix will be `0001`.
|
|
||||||
|
|
||||||
For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
|
|
||||||
and use this as a cue to construct paths with no prefix at all.
|
|
||||||
|
|
||||||
#### Storage Format: Indices
|
|
||||||
|
|
||||||
In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
|
|
||||||
when we implement shard splitting in future, it will be useful to enable shards to reference layers
|
|
||||||
written by other shards (specifically the parent shard during a split), so that shards don't
|
|
||||||
have to exhaustively copy all data into their own shard-prefixed keys.
|
|
||||||
|
|
||||||
To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
|
|
||||||
tuple on each layer, such that it can construct paths for layers written by other shards. This
|
|
||||||
naturally raises the question of who "owns" such layers written by ancestral shards: this problem
|
|
||||||
will be addressed in phase 2.
|
|
||||||
|
|
||||||
For backward compatibility, any index entry without shard information will be assumed to be
|
|
||||||
in the legacy `ShardIdentity`.
|
|
||||||
|
|
||||||
#### WAL Ingest
|
|
||||||
|
|
||||||
In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
|
|
||||||
it down to the pages relevant to their shard:
|
|
||||||
|
|
||||||
- For ordinary user data writes, only retain a write if it matches the ShardIdentity
|
|
||||||
- For metadata describing relations etc, all shards retain these writes.
|
|
||||||
|
|
||||||
The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
|
|
||||||
one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
|
|
||||||
and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
|
|
||||||
expensive: if the safekeeper can be made shard-aware then it could be taught to use
|
|
||||||
the max() of all shards' remote_consistent_lsns to decide when to trim the WAL.
|
|
||||||
|
|
||||||
#### Compaction/GC
|
|
||||||
|
|
||||||
No changes needed.
|
|
||||||
|
|
||||||
The pageserver doesn't have to do anything special during compaction
|
|
||||||
or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
|
|
||||||
This will result in sparse layer files, containing keys only in the stripes that this
|
|
||||||
shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
|
|
||||||
the key range, these should be updated to ignore gaps that are due to sharding, to
|
|
||||||
avoid spuriously splitting up layers into stripe-sized pieces.
|
|
||||||
|
|
||||||
### Compute Endpoints
|
|
||||||
|
|
||||||
Compute endpoints will need to:
|
|
||||||
|
|
||||||
- Accept a vector of connection strings as part of their configuration from the control plane
|
|
||||||
- Route pageserver requests according to mapping the hash of key to the correct
|
|
||||||
entry in the vector of connection strings.
|
|
||||||
|
|
||||||
Doing this in compute rather than routing requests via a single pageserver is
|
|
||||||
necessary to enable sharding tenants without adding latency from extra hops.
|
|
||||||
|
|
||||||
### Control Plane
|
|
||||||
|
|
||||||
Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
|
|
||||||
be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
|
|
||||||
tenants.
|
|
||||||
|
|
||||||
Tenant lifecycle operations like deletion will require fanning-out to all the shards
|
|
||||||
in the tenant. The same goes for timeline creation and deletion: a timeline should
|
|
||||||
not be considered created until it has been created in all shards.
|
|
||||||
|
|
||||||
#### Selectively enabling sharding for large tenants
|
|
||||||
|
|
||||||
Initially, we will explicitly enable sharding for large tenants only.
|
|
||||||
|
|
||||||
In future, this hint mechanism will become optional when we implement automatic
|
|
||||||
re-sharding of tenants.
|
|
||||||
|
|
||||||
## Future Phases
|
|
||||||
|
|
||||||
This section exists to indicate what will likely come next after this phase.
|
|
||||||
|
|
||||||
Phases 2a and 2b are amenable to execution in parallel.
|
|
||||||
|
|
||||||
### Phase 2a: WAL fan-out
|
|
||||||
|
|
||||||
**Problem**: when all shards consume the whole WAL, the network bandwidth used
|
|
||||||
for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
|
|
||||||
of the shard count.
|
|
||||||
|
|
||||||
Network bandwidth is not our most pressing bottleneck, but it is likely to become
|
|
||||||
a problem if we set a modest shard count (~8) on a significant number of tenants,
|
|
||||||
especially as those larger tenants which we shard are also likely to have higher
|
|
||||||
write bandwidth than average.
|
|
||||||
|
|
||||||
### Phase 2b: Shard Splitting
|
|
||||||
|
|
||||||
**Problem**: the number of shards in a tenant is defined at creation time and cannot
|
|
||||||
be changed. This causes excessive sharding for most small tenants, and an upper
|
|
||||||
bound on scale for very large tenants.
|
|
||||||
|
|
||||||
To address this, a _splitting_ feature will later be added. One shard can split its
|
|
||||||
data into a number of children by doing a special compaction operation to generate
|
|
||||||
image layers broken up child-shard-wise, and then writing out an `index_part.json` for
|
|
||||||
each child. This will then require external coordination (by the control plane) to
|
|
||||||
safely attach these new child shards and then move them around to distribute work.
|
|
||||||
The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
|
|
||||||
once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
|
|
||||||
the risk/complexity of implementing such a rarely-encountered scenario.
|
|
||||||
|
|
||||||
### Phase N (future): distributed historical reads
|
|
||||||
|
|
||||||
**Problem**: while sharding based on key is good for handling changes in overall
|
|
||||||
database size, it is less suitable for spiky/unpredictable changes in the read
|
|
||||||
workload to historical layers. Sudden increases in historical reads could result
|
|
||||||
in sudden increases in local disk capacity required for a TenantShard.
|
|
||||||
|
|
||||||
Example: the extreme case of this would be to run a tenant for a year, then create branches
|
|
||||||
with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
|
|
||||||
the on-disk capacity footprint of a TenantShard, since it would be serving reads
|
|
||||||
from all those disparate historical layers.
|
|
||||||
|
|
||||||
If we can respond fast enough, then key-sharding a tenant more finely can help with
|
|
||||||
this, but splitting may be a relatively expensive operation and the increased historical
|
|
||||||
read load may be transient.
|
|
||||||
|
|
||||||
A separate mechanism for handling heavy historical reads could be something like
|
|
||||||
a gossip mechanism for pageservers to communicate
|
|
||||||
about their workload, and then a getpageatlsn offload mechanism where one pageserver can
|
|
||||||
ask another to go read the necessary layers from remote storage to serve the read. This
|
|
||||||
requires relatively little coordination because it is read-only: any node can service any
|
|
||||||
read. All reads to a particular shard would still flow through one node, but the
|
|
||||||
disk capacity & I/O impact of servicing the read would be distributed.
|
|
||||||
|
|
||||||
## FAQ/Alternatives
|
|
||||||
|
|
||||||
### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
|
|
||||||
|
|
||||||
When a database is growing under a write workload, writes may predominantly hit the
|
|
||||||
end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
|
|
||||||
is intensively re-writing a particular relation, if that relation lived in a particular
|
|
||||||
shard then it would not achieve our goal of distributing the write work across shards.
|
|
||||||
|
|
||||||
### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
|
|
||||||
|
|
||||||
1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
|
|
||||||
database would still cause a load hotspot on the pageserver routing its read requests.
|
|
||||||
2. The additional hop through the "proxy" pageserver would add latency and overall
|
|
||||||
resource cost (CPU, network bandwidth)
|
|
||||||
|
|
||||||
### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
|
|
||||||
|
|
||||||
In this model, there would be no explicit sharding of work, but the pageserver to which
|
|
||||||
a tenant is attached would not hold all layers on its disk: instead, it would call out
|
|
||||||
to peers to have them store some layers, and call out to those peers to request reads
|
|
||||||
in those layers.
|
|
||||||
|
|
||||||
This mechanism will work well for distributing work in the LSN dimension, but in the key
|
|
||||||
space dimension it has the major limitation of requiring one node to handle all
|
|
||||||
incoming writes, and compactions. Even if the write workload for a large database
|
|
||||||
fits in one pageserver, it will still be a hotspot and such tenants may still
|
|
||||||
de-facto require their own pageserver.
|
|
||||||
@@ -1,479 +0,0 @@
|
|||||||
# Shard splitting
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
This RFC describes a new pageserver API for splitting an existing tenant shard into
|
|
||||||
multiple shards, and describes how to use this API to safely increase the total
|
|
||||||
shard count of a tenant.
|
|
||||||
|
|
||||||
## Motivation
|
|
||||||
|
|
||||||
In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
|
|
||||||
tenants beyond the capacity of a single pageserver by breaking up the key space
|
|
||||||
into stripes, and distributing these stripes across many pageservers. However,
|
|
||||||
the shard count was defined once at tenant creation time and not varied thereafter.
|
|
||||||
|
|
||||||
In practice, the expected size of a database is rarely known at creation time, and
|
|
||||||
it is inefficient to enable sharding for very small tenants: we need to be
|
|
||||||
able to create a tenant with a small number of shards (such as 1), and later expand
|
|
||||||
when it becomes clear that the tenant has grown in size to a point where sharding
|
|
||||||
is beneficial.
|
|
||||||
|
|
||||||
### Prior art
|
|
||||||
|
|
||||||
Many distributed systems have the problem of choosing how many shards to create for
|
|
||||||
tenants that do not specify an expected size up-front. There are a couple of general
|
|
||||||
approaches:
|
|
||||||
|
|
||||||
- Write to a key space in order, and start a new shard when the highest key advances
|
|
||||||
past some point. This doesn't work well for Neon, because we write to our key space
|
|
||||||
in many different contiguous ranges (per relation), rather than in one contiguous
|
|
||||||
range. To adapt to this kind of model, we would need a sharding scheme where each
|
|
||||||
relation had its own range of shards, which would be inefficient for the common
|
|
||||||
case of databases with many small relations.
|
|
||||||
- Monitor the system, and automatically re-shard at some size threshold. For
|
|
||||||
example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
|
|
||||||
component monitors the size of each RADOS Pool, and adjusts the number of Placement
|
|
||||||
Groups (Ceph's shard equivalent).
|
|
||||||
|
|
||||||
## Requirements
|
|
||||||
|
|
||||||
- A configurable capacity limit per-shard is enforced.
|
|
||||||
- Changes in shard count do not interrupt service beyond requiring postgres
|
|
||||||
to reconnect (i.e. milliseconds).
|
|
||||||
- A human being does not have to choose the shard count
|
|
||||||
|
|
||||||
## Non Goals
|
|
||||||
|
|
||||||
- Shard splitting is always a tenant-global operation: we will not enable splitting
|
|
||||||
one shard while leaving others intact.
|
|
||||||
- The inverse operation (shard merging) is not described in this RFC. This is a lower
|
|
||||||
priority than splitting, because databases grow more often than they shrink, and
|
|
||||||
a database with many shards will still work properly if the stored data shrinks, just
|
|
||||||
with slightly more overhead (e.g. redundant WAL replication)
|
|
||||||
- Shard splitting is only initiated based on capacity bounds, not load. Splitting
|
|
||||||
a tenant based on load will make sense for some medium-capacity, high-load workloads,
|
|
||||||
but is more complex to reason about and likely is not desirable until we have
|
|
||||||
shard merging to reduce the shard count again if the database becomes less busy.
|
|
||||||
|
|
||||||
## Impacted Components
|
|
||||||
|
|
||||||
pageserver, storage controller
|
|
||||||
|
|
||||||
(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
|
|
||||||
|
|
||||||
## Terminology
|
|
||||||
|
|
||||||
**Parent** shards are the shards that exist before a split. **Child** shards are
|
|
||||||
the new shards created during a split.
|
|
||||||
|
|
||||||
**Shard** is synonymous with _tenant shard_.
|
|
||||||
|
|
||||||
**Shard Index** is the 2-tuple of shard number and shard count, written in
|
|
||||||
paths as {:02x}{:02x}, e.g. `0001`.
|
|
||||||
|
|
||||||
## Background
|
|
||||||
|
|
||||||
In the implementation section, a couple of existing aspects of sharding are important
|
|
||||||
to remember:
|
|
||||||
|
|
||||||
- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
|
|
||||||
a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
|
|
||||||
storage paths, and remote index metadata.
|
|
||||||
- Remote layer file paths contain the shard index of the shard that created them, and
|
|
||||||
remote indices contain the same index to enable building the layer file path. A shard's
|
|
||||||
index may reference layers that were created by another shard.
|
|
||||||
- Local tenant shard directories include the shard index. All layers downloaded by
|
|
||||||
a tenant shard are stored in this shard-prefixed path, even if those layers were
|
|
||||||
initially created by another shard: tenant shards do not read and write one another's
|
|
||||||
paths.
|
|
||||||
- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
|
|
||||||
This is for historical reasons and will be cleaned up in future, but the existing
|
|
||||||
name is used here to help comprehension when reading code.
|
|
||||||
|
|
||||||
## Implementation
|
|
||||||
|
|
||||||
Note: this section focuses on the correctness of the core split process. This will
|
|
||||||
be fairly inefficient in a naive implementation, and several important optimizations
|
|
||||||
are described in a later section.
|
|
||||||
|
|
||||||
There are broadly two parts to the implementation:
|
|
||||||
|
|
||||||
1. The pageserver split API, which splits one shard on one pageserver
|
|
||||||
2. The overall tenant split process which is coordinated by the storage controller,
|
|
||||||
and calls into the pageserver split API as needed.
|
|
||||||
|
|
||||||
### Pageserver Split API
|
|
||||||
|
|
||||||
The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
|
|
||||||
that takes the new total shard count in the body.
|
|
||||||
|
|
||||||
The pageserver split API operates on one tenant shard, on one pageserver. External
|
|
||||||
coordination is required to use it safely: this is described in the later
|
|
||||||
'Split procedure' section.
|
|
||||||
|
|
||||||
#### Preparation
|
|
||||||
|
|
||||||
First identify the shard indices for the new child shards. These are deterministic,
|
|
||||||
calculated from the parent shard's index, and the number of children being created (this
|
|
||||||
is an input to the API, and validated to be a power of two). In a trivial example, splitting
|
|
||||||
0001 in two always results in 0002 and 0102.
|
|
||||||
|
|
||||||
Child shard indices are chosen such that the children's parts of the keyspace will
|
|
||||||
be subsets of the parent's parts of the keyspace.
|
|
||||||
|
|
||||||
#### Step 1: write new remote indices
|
|
||||||
|
|
||||||
In remote storage, splitting is very simple: we may just write new index_part.json
|
|
||||||
objects for each child shard, containing exactly the same layers as the parent shard.
|
|
||||||
|
|
||||||
The children will have more data than they need, but this avoids any exhaustive
|
|
||||||
re-writing or copying of layer files.
|
|
||||||
|
|
||||||
The index key path includes a generation number: the parent shard's current
|
|
||||||
attached generation number will also be used for the child shards' indices. This
|
|
||||||
makes the operation safely retryable: if everything crashes and restarts, we may
|
|
||||||
call the split API again on the parent shard, and the result will be some new remote
|
|
||||||
indices for the child shards, under a higher generation number.
|
|
||||||
|
|
||||||
#### Step 2: start new `Tenant` objects
|
|
||||||
|
|
||||||
A new `Tenant` object may be instantiated for each child shard, while the parent
|
|
||||||
shard still exists. When calling the tenant_spawn function for this object,
|
|
||||||
the remote index from step 1 will be read, and the child shard will start
|
|
||||||
to ingest WAL to catch up from whatever was in the remote storage at step 1.
|
|
||||||
|
|
||||||
We now wait for child shards' WAL ingestion to catch up with the parent shard,
|
|
||||||
so that we can safely tear down the parent shard without risking an availability
|
|
||||||
gap to clients reading recent LSNs.
|
|
||||||
|
|
||||||
#### Step 3: tear down parent `Tenant` object
|
|
||||||
|
|
||||||
Once child shards are running and have caught up with WAL ingest, we no longer
|
|
||||||
need the parent shard. Note that clients may still be using it -- when we
|
|
||||||
shut it down, any page_service handlers will also shut down, causing clients
|
|
||||||
to disconnect. When the client reconnects, it will re-lookup the tenant,
|
|
||||||
and hit the child shard instead of the parent (shard lookup from page_service
|
|
||||||
should bias toward higher ShardCount shards).
|
|
||||||
|
|
||||||
Note that at this stage the page service client has not yet been notified of
|
|
||||||
any split. In the trivial single split example:
|
|
||||||
|
|
||||||
- Shard 0001 is gone: Tenant object torn down
|
|
||||||
- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
|
|
||||||
- Clients will continue to connect to that server thinking that shard 0001 is there,
|
|
||||||
and all requests will work, because any key that was in shard 0001 is definitely
|
|
||||||
available in either shard 0002 or shard 0102.
|
|
||||||
- Eventually, the storage controller (not the pageserver) will decide to migrate
|
|
||||||
some child shards away: at that point it will do a live migration, ensuring
|
|
||||||
that the client has an updated configuration before it detaches anything
|
|
||||||
from the original server.
|
|
||||||
|
|
||||||
#### Complete
|
|
||||||
|
|
||||||
When we send a 200 response to the split request, we are promising the caller:
|
|
||||||
|
|
||||||
- That the child shards are persistent in remote storage
|
|
||||||
- That the parent shard has been shut down
|
|
||||||
|
|
||||||
This enables the caller to proceed with the overall shard split operation, which
|
|
||||||
may involve other shards on other pageservers.
|
|
||||||
|
|
||||||
### Storage Controller Split procedure
|
|
||||||
|
|
||||||
Splitting a tenant requires calling the pageserver split API, and tracking
|
|
||||||
enough state to ensure recovery + completion in the event of any component (pageserver
|
|
||||||
or storage controller) crashing (or request timing out) during the split.
|
|
||||||
|
|
||||||
1. call the split API on all existing shards. Ensure that the resulting
|
|
||||||
child shards are pinned to their pageservers until _all_ the split calls are done.
|
|
||||||
This pinning may be implemented as a "split bit" on the tenant shards, that
|
|
||||||
blocks any migrations, and also acts as a sign that if we restart, we must go
|
|
||||||
through some recovery steps to resume the split.
|
|
||||||
2. Once all the split calls are done, we may unpin the child shards (clear
|
|
||||||
the split bit). The split is now complete: subsequent steps are just migrations,
|
|
||||||
not strictly part of the split.
|
|
||||||
3. Try to schedule new pageserver locations for the child shards, using
|
|
||||||
a soft anti-affinity constraint to place shards from the same tenant onto different
|
|
||||||
pageservers.
|
|
||||||
|
|
||||||
Updating computes about the new shard count is not necessary until we migrate
|
|
||||||
any of the child shards away from the parent's location.
|
|
||||||
|
|
||||||
### Recovering from failures
|
|
||||||
|
|
||||||
#### Rolling back an incomplete split
|
|
||||||
|
|
||||||
An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
|
|
||||||
and detaching child shards. This will lose any WAL ingested into the children after the parents
|
|
||||||
were detached earlier, but the parents will catch up.
|
|
||||||
|
|
||||||
No special pageserver API is needed for this. From the storage controller's point of view, the
|
|
||||||
procedure is:
|
|
||||||
|
|
||||||
1. For all parent shards in the tenant, ensure they are attached
|
|
||||||
2. For all child shards, ensure they are not attached
|
|
||||||
3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
|
|
||||||
|
|
||||||
Any remote storage content for child shards is left behind. This is similar to other cases where
|
|
||||||
we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
|
|
||||||
index that references it). Future online scrub/cleanup functionality can remove these objects, or
|
|
||||||
they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
|
|
||||||
which would include any child shards that were rolled back.
|
|
||||||
|
|
||||||
If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
|
|
||||||
this, we will **block timeline creation during splitting**, so that we can safely roll back until
|
|
||||||
the split is complete, without risking losing timelines.
|
|
||||||
|
|
||||||
Rolling back an incomplete split will happen automatically if a split fails due to some fatal
|
|
||||||
reason, and will not be accessible via an API:
|
|
||||||
|
|
||||||
- A pageserver fails to complete its split API request after too many retries
|
|
||||||
- A pageserver returns a fatal unexpected error such as 400 or 500
|
|
||||||
- The storage controller database returns a non-retryable error
|
|
||||||
- Some internal invariant is violated in the storage controller split code
|
|
||||||
|
|
||||||
#### Rolling back a complete split
|
|
||||||
|
|
||||||
A complete shard split may be rolled back similarly to an incomplete split, with the following
|
|
||||||
modifications:
|
|
||||||
|
|
||||||
- The parent shards will no longer exist in the storage controller database, so these must
|
|
||||||
be re-synthesized somehow: the hard part of this is figuring the parent shards' generations. This
|
|
||||||
may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
|
|
||||||
shards in the storage controller database.
|
|
||||||
- Any timelines that were created after the split completed will disappear when rolling back
|
|
||||||
to the parent shards. For this reason, rolling back after a complete split should only
|
|
||||||
be done due to serious issues where loss of recently created timelines is acceptable, or
|
|
||||||
in cases where we have confirmed that no timelines were created in the intervening period.
|
|
||||||
- Parent shards' layers must not have been deleted: this property will come "for free" when
|
|
||||||
we first roll out sharding, by simply not implementing deletion of parent layers after
|
|
||||||
a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
|
|
||||||
Optimizations section), it should apply a TTL to layers such that we have a
|
|
||||||
defined walltime window in which rollback will be possible.
|
|
||||||
|
|
||||||
The storage controller will expose an API for rolling back a complete split, for use
|
|
||||||
in the field if we encounter some critical bug with a post-split tenant.
|
|
||||||
|
|
||||||
#### Retrying API calls during Pageserver Restart
|
|
||||||
|
|
||||||
When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
|
|
||||||
child shards from an ongoing split. This does not intrinsically break anything, and the
|
|
||||||
pageserver may include all these shards in its `/re-attach` request to the storage controller.
|
|
||||||
|
|
||||||
In order to support such restarts, it is important that the storage controller stores
|
|
||||||
persistent records of each child shard before it calls into a pageserver, as these child shards
|
|
||||||
may require generation increments via a `/re-attach` request.
|
|
||||||
|
|
||||||
The pageserver restart will also result in a failed API call from the storage controller's point
|
|
||||||
of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
|
|
||||||
complete, and all shards must remain pinned to their current pageserver locations until the
|
|
||||||
split is done.
|
|
||||||
|
|
||||||
The pageserver API calls during splitting will retry on transient errors, so that
|
|
||||||
short availability gaps do not result in a failure of the overall operation. The
|
|
||||||
split in progress will be automatically rolled back if the threshold for API
|
|
||||||
retries is reached (e.g. if a pageserver stays offline for longer than a typical
|
|
||||||
restart).
|
|
||||||
|
|
||||||
#### Rollback on Storage Controller Restart
|
|
||||||
|
|
||||||
On startup, the storage controller will inspect the split bit for tenant shards that
|
|
||||||
it loads from the database. If any splits are in progress:
|
|
||||||
|
|
||||||
- Database content will be reverted to the parent shards
|
|
||||||
- Child shards will be dropped from memory
|
|
||||||
- The parent and child shards will be included in the general startup reconciliation that
|
|
||||||
the storage controller does: any child shards will be detached from pageservers because
|
|
||||||
they don't exist in the storage controller's expected set of shards, and parent shards
|
|
||||||
will be attached if they aren't already.
|
|
||||||
|
|
||||||
#### Storage controller API request failures/retries
|
|
||||||
|
|
||||||
The split request handler will implement idempotency: if the [`Tenant`] requested to split
|
|
||||||
doesn't exist, we will check for the would-be child shards, and if they already exist,
|
|
||||||
we consider the request complete.
|
|
||||||
|
|
||||||
If a request is retried while the original request is still underway, then the split
|
|
||||||
request handler will notice an InProgress marker in TenantManager, and return 503
|
|
||||||
to encourage the client to backoff/retry. This is the same as the general pageserver
|
|
||||||
API handling for calls that try to act on an InProgress shard.
|
|
||||||
|
|
||||||
#### Compute start/restart during a split
|
|
||||||
|
|
||||||
If a compute starts up during split, it will be configured with the old sharding
|
|
||||||
configuration. This will work for reads irrespective of the progress of the split
|
|
||||||
as long as no child shards have been migrated away from their original location, and
|
|
||||||
this is guaranteed in the split procedure (see earlier section).
|
|
||||||
|
|
||||||
#### Pageserver fails permanently during a split
|
|
||||||
|
|
||||||
If a pageserver permanently fails (i.e. the storage controller availability state for it
|
|
||||||
goes to Offline) while a split is in progress, the splitting operation will roll back, and
|
|
||||||
during the roll back it will skip any API calls to the offline pageserver. If the offline
|
|
||||||
pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
|
|
||||||
|
|
||||||
### Handling secondary locations
|
|
||||||
|
|
||||||
For correctness, it is not necessary to split secondary locations. We can simply detach
|
|
||||||
the secondary locations for parent shards, and then attach new secondary locations
|
|
||||||
for child shards.
|
|
||||||
|
|
||||||
Clearly this is not optimal, as it will result in re-downloads of layer files that
|
|
||||||
were already present on disk. See "Splitting secondary locations" in the Optimizations section.
|
|
||||||
|
|
||||||
### Conditions to trigger a split
|
|
||||||
|
|
||||||
The pageserver will expose a new API for reporting on shards that are candidates
|
|
||||||
for split: this will return a top-N report of the largest tenant shards by
|
|
||||||
physical size (remote size). This should exclude any tenants that are already
|
|
||||||
at the maximum configured shard count.
|
|
||||||
|
|
||||||
The API would look something like:
|
|
||||||
`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
|
|
||||||
|
|
||||||
The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
|
|
||||||
|
|
||||||
A split operation will be started when the tenant exceeds some threshold. This threshold
|
|
||||||
should be _less than_ how large we actually want shards to be, perhaps much less. That's to
|
|
||||||
minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
|
|
||||||
wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
|
|
||||||
tenant size distribution may be useful here: if we can make a statement like "usually, if
|
|
||||||
a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
|
|
||||||
make our policy to split a tenant at 20GiB.
|
|
||||||
|
|
||||||
The finest split we can do is by factors of two, but we can do higher-cardinality splits
|
|
||||||
too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
|
|
||||||
as it grows. An example of a very simple heuristic for early deployment of the splitting
|
|
||||||
feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
|
|
||||||
would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
|
|
||||||
split a tenant, it will not need re-splitting soon after.
|
|
||||||
|
|
||||||
## Optimizations
|
|
||||||
|
|
||||||
### Flush parent shard to remote storage during split
|
|
||||||
|
|
||||||
Any data that is in WAL but not remote storage at time of split will need
|
|
||||||
to be replayed by child shards when they start for the first time. To minimize
|
|
||||||
this work, we may flush the parent shard to remote storage before writing the
|
|
||||||
remote indices for child shards.
|
|
||||||
|
|
||||||
It is important that this flush is subject to some time bounds: we may be splitting
|
|
||||||
in response to a surge of write ingest, so it may be time-critical to split. A
|
|
||||||
few seconds to flush latest data should be sufficient to optimize common cases without
|
|
||||||
running the risk of holding up a split for a harmful length of time when a parent
|
|
||||||
shard is being written heavily. If the flush doesn't complete in time, we may proceed
|
|
||||||
to shut down the parent shard and carry on with the split.
|
|
||||||
|
|
||||||
### Hard linking parent layers into child shard directories
|
|
||||||
|
|
||||||
Before we start the Tenant objects for child shards, we may pre-populate their
|
|
||||||
local storage directories with hard links to the layer files already present
|
|
||||||
in the parent shard's local directory. When the child shard starts and downloads
|
|
||||||
its remote index, it will find all those layer files already present on local disk.
|
|
||||||
|
|
||||||
This avoids wasting download capacity and makes splitting faster, but more importantly
|
|
||||||
it avoids taking up a factor of N more disk space when splitting 1 shard into N.
|
|
||||||
|
|
||||||
This mechanism will work well in typical flows where shards are migrated away
|
|
||||||
promptly after a split, but for the general case including what happens when
|
|
||||||
layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
|
|
||||||
section below.
|
|
||||||
|
|
||||||
### Filtering during compaction
|
|
||||||
|
|
||||||
Compaction, especially image layer generation, should skip any keys that are
|
|
||||||
present in a shard's layer files, but do not match the shard's ShardIdentity's
|
|
||||||
is_key_local() check. This avoids carrying around data for longer than necessary
|
|
||||||
in post-split compactions.
|
|
||||||
|
|
||||||
This was already implemented in https://github.com/neondatabase/neon/pull/6246
|
|
||||||
|
|
||||||
### Proactive compaction
|
|
||||||
|
|
||||||
In remote storage, there is little reason to rewrite any data on a shard split:
|
|
||||||
all the children can reference parent layers via the very cheap write of the child
|
|
||||||
index_part.json.
|
|
||||||
|
|
||||||
In local storage, things are more nuanced. During the initial split there is no
|
|
||||||
capacity cost to duplicating parent layers, if we implement the hard linking
|
|
||||||
optimization described above. However, as soon as any layers are evicted from
|
|
||||||
local disk and re-downloaded, the downloaded layers will not be hard-links any more:
|
|
||||||
they'll have real capacity footprint. That isn't a problem if we migrate child shards
|
|
||||||
away from the parent node swiftly, but it risks a significant over-use of local disk
|
|
||||||
space if we do not.
|
|
||||||
|
|
||||||
For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
|
|
||||||
the shards elsewhere, then churned all the layers in all the shards via eviction,
|
|
||||||
then we would blow up the storage capacity used on the node by 8x. If we're splitting
|
|
||||||
a 100GB shard, that could take the pageserver to the point of exhausting disk space.
|
|
||||||
|
|
||||||
To avoid this scenario, we could implement a special compaction mode where we just
|
|
||||||
read historic layers, drop unwanted keys, and write back the layer file. This
|
|
||||||
is pretty expensive, but useful if we have split a large shard and are not going to
|
|
||||||
migrate the child shards away.
|
|
||||||
|
|
||||||
The heuristic conditions for triggering such a compaction are:
|
|
||||||
|
|
||||||
- A) eviction plus time: if a child shard
|
|
||||||
has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
|
|
||||||
- B) resident size plus time: we may inspect the resident layers and calculate how
|
|
||||||
many of them include the overhead of storing pre-split keys. If, after some time
|
|
||||||
threshold (different to the one in case A) we still have such layers occupying
|
|
||||||
local disk space, then we should proactively compact them.
|
|
||||||
|
|
||||||
### Cleaning up parent-shard layers
|
|
||||||
|
|
||||||
It is functionally harmless to leave parent shard layers in remote storage indefinitely.
|
|
||||||
They would be cleaned up in the event of the tenant's deletion.
|
|
||||||
|
|
||||||
As an optimization to avoid leaking remote storage capacity (which costs money), we may
|
|
||||||
lazily clean up parent shard layers once no child shards reference them.
|
|
||||||
|
|
||||||
This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
|
|
||||||
|
|
||||||
- list all the key prefixes beginning with the tenant ID, and select those shard prefixes
|
|
||||||
which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_)
|
|
||||||
- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
|
|
||||||
may drop out now.
|
|
||||||
- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
|
|
||||||
- for all ancestral shards, list objects in the prefix and delete any layer which was not
|
|
||||||
referenced by a current shard.
|
|
||||||
|
|
||||||
If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
|
|
||||||
|
|
||||||
The cleanup may be done by the scrubber (external process), or we may choose to have
|
|
||||||
the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
|
|
||||||
reading the other shard's indices at runtime, and we do not require visibility of the
|
|
||||||
latest index writes.
|
|
||||||
|
|
||||||
Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
|
|
||||||
that we retain the option to roll back a split in case of bugs.
|
|
||||||
|
|
||||||
### Splitting secondary locations
|
|
||||||
|
|
||||||
We may implement a pageserver API similar to the main splitting API, which does a simpler
|
|
||||||
operation for secondary locations: it would not write anything to S3, instead it would simply
|
|
||||||
create the child shard directory on local disk, hard link in directories from the parent,
|
|
||||||
and set up the in memory (TenantSlot) state for the children.
|
|
||||||
|
|
||||||
Similar to attached locations, a subset of secondary locations will probably need re-locating
|
|
||||||
after the split is complete, to avoid leaving multiple child shards on the same pageservers,
|
|
||||||
where they may use excessive space for the tenant.
|
|
||||||
|
|
||||||
## FAQ/Alternatives
|
|
||||||
|
|
||||||
### What should the thresholds be set to?
|
|
||||||
|
|
||||||
Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
|
|
||||||
|
|
||||||
Max shard count:
|
|
||||||
|
|
||||||
- The safekeeper overhead to sharding is currently O(N) network bandwidth because
|
|
||||||
the un-filtered WAL is sent to all shards. To avoid this growing out of control,
|
|
||||||
a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
|
|
||||||
on the safekeeper.
|
|
||||||
- there is also little benefit to increasing the shard count beyond the number
|
|
||||||
of pageservers in a region.
|
|
||||||
|
|
||||||
### Is it worth just rewriting all the data during a split to simplify reasoning about space?
|
|
||||||
@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
|
|||||||
Neon storage broker, providing messaging between safekeepers and pageservers.
|
Neon storage broker, providing messaging between safekeepers and pageservers.
|
||||||
[storage_broker.md](./storage_broker.md)
|
[storage_broker.md](./storage_broker.md)
|
||||||
|
|
||||||
`storage_controller`:
|
|
||||||
|
|
||||||
Neon storage controller, manages a cluster of pageservers and exposes an API that enables
|
|
||||||
managing a many-sharded tenant as a single entity.
|
|
||||||
|
|
||||||
`/control_plane`:
|
`/control_plane`:
|
||||||
|
|
||||||
Local control plane.
|
Local control plane.
|
||||||
|
|||||||
@@ -1,150 +0,0 @@
|
|||||||
# Storage Controller
|
|
||||||
|
|
||||||
## Concepts
|
|
||||||
|
|
||||||
The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
|
|
||||||
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
|
|
||||||
|
|
||||||
It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
|
|
||||||
the underlying details of how data is spread across multiple nodes.
|
|
||||||
|
|
||||||
The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
|
|
||||||
|
|
||||||
## APIs
|
|
||||||
|
|
||||||
The storage controller’s HTTP server implements four logically separate APIs:
|
|
||||||
|
|
||||||
- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver.
|
|
||||||
- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits.
|
|
||||||
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
|
|
||||||
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
|
|
||||||
to ensure data safety with generation numbers.
|
|
||||||
|
|
||||||
The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs).
|
|
||||||
|
|
||||||
See the `http.rs` file in the source for where the HTTP APIs are implemented.
|
|
||||||
|
|
||||||
## Database
|
|
||||||
|
|
||||||
The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
|
|
||||||
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
|
|
||||||
rebuilt on startup.
|
|
||||||
|
|
||||||
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
|
|
||||||
|
|
||||||
The `diesel` crate is used for defining models & migrations.
|
|
||||||
|
|
||||||
Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller’s database.
|
|
||||||
|
|
||||||
### Diesel tip: migrations
|
|
||||||
|
|
||||||
If you need to modify the database schema, here’s how to create a migration:
|
|
||||||
|
|
||||||
- Install the diesel CLI with `cargo install diesel_cli`
|
|
||||||
- Use `diesel migration generate <name>` to create a new migration
|
|
||||||
- Populate the SQL files in the `migrations/` subdirectory
|
|
||||||
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
|
|
||||||
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
|
|
||||||
- Commit the migration files and the changes to schema.rs
|
|
||||||
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
|
|
||||||
- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
|
|
||||||
|
|
||||||
## storcon_cli
|
|
||||||
|
|
||||||
The `storcon_cli` tool enables interactive management of the storage controller. This is usually
|
|
||||||
only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline).
|
|
||||||
|
|
||||||
`storcon_cli --help` includes details on commands.
|
|
||||||
|
|
||||||
# Deploying
|
|
||||||
|
|
||||||
This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
|
|
||||||
part of a self-hosted system.
|
|
||||||
|
|
||||||
_General note: since the default `neon_local` environment includes a storage controller, this is a useful
|
|
||||||
reference when figuring out deployment._
|
|
||||||
|
|
||||||
## Database
|
|
||||||
|
|
||||||
It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
|
|
||||||
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
|
|
||||||
|
|
||||||
The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
|
|
||||||
|
|
||||||
Set the URL to the database using the `--database-url` CLI option.
|
|
||||||
|
|
||||||
There is no need to run migrations manually: the storage controller automatically applies migrations
|
|
||||||
when it starts up.
|
|
||||||
|
|
||||||
## Configure pageservers to use the storage controller
|
|
||||||
|
|
||||||
1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
|
|
||||||
point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
|
|
||||||
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
|
|
||||||
with the storage controller when it starts up. See the example below for the format of this file.
|
|
||||||
|
|
||||||
### Example `metadata.json`
|
|
||||||
|
|
||||||
```
|
|
||||||
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
|
|
||||||
```
|
|
||||||
|
|
||||||
- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
|
|
||||||
postgres runs.
|
|
||||||
- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where
|
|
||||||
the storage controller runs.
|
|
||||||
|
|
||||||
## Handle compute notifications.
|
|
||||||
|
|
||||||
The storage controller independently moves tenant attachments between pageservers in response to
|
|
||||||
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
|
|
||||||
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
|
|
||||||
location changes.
|
|
||||||
|
|
||||||
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
|
|
||||||
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
|
|
||||||
|
|
||||||
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
|
|
||||||
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
|
|
||||||
the compute hook.
|
|
||||||
|
|
||||||
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
|
|
||||||
the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
|
|
||||||
|
|
||||||
```
|
|
||||||
struct ComputeHookNotifyRequestShard {
|
|
||||||
node_id: NodeId,
|
|
||||||
shard_number: ShardNumber,
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ComputeHookNotifyRequest {
|
|
||||||
tenant_id: TenantId,
|
|
||||||
stripe_size: Option<ShardStripeSize>,
|
|
||||||
shards: Vec<ComputeHookNotifyRequestShard>,
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
When a notification is received:
|
|
||||||
|
|
||||||
1. Modify postgres configuration for this tenant:
|
|
||||||
|
|
||||||
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
|
|
||||||
shards identified by `NodeId` must be converted to the address+port of the node.
|
|
||||||
- if `stripe_size` is not None, set `neon.stripe_size` to this value
|
|
||||||
|
|
||||||
2. Send SIGHUP to postgres to reload configuration
|
|
||||||
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
|
|
||||||
will retry the notification until it succeeds.
|
|
||||||
|
|
||||||
### Example notification body
|
|
||||||
|
|
||||||
```
|
|
||||||
{
|
|
||||||
"tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
|
|
||||||
"stripe_size": 32768,
|
|
||||||
"shards": [
|
|
||||||
{"node_id": 344, "shard_number": 0},
|
|
||||||
{"node_id": 722, "shard_number": 1}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
@@ -10,13 +10,11 @@ libc.workspace = true
|
|||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
chrono.workspace = true
|
chrono.workspace = true
|
||||||
twox-hash.workspace = true
|
twox-hash.workspace = true
|
||||||
measured.workspace = true
|
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
[target.'cfg(target_os = "linux")'.dependencies]
|
[target.'cfg(target_os = "linux")'.dependencies]
|
||||||
procfs.workspace = true
|
procfs.workspace = true
|
||||||
measured-process.workspace = true
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
|
|||||||
@@ -7,19 +7,14 @@
|
|||||||
//! use significantly less memory than this, but can only approximate the cardinality.
|
//! use significantly less memory than this, but can only approximate the cardinality.
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
hash::{BuildHasher, BuildHasherDefault, Hash},
|
collections::HashMap,
|
||||||
sync::atomic::AtomicU8,
|
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
|
||||||
|
sync::{atomic::AtomicU8, Arc, RwLock},
|
||||||
};
|
};
|
||||||
|
|
||||||
use measured::{
|
use prometheus::{
|
||||||
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
core::{self, Describer},
|
||||||
metric::{
|
proto, Opts,
|
||||||
group::{Encoding, MetricValue},
|
|
||||||
name::MetricNameEncoder,
|
|
||||||
Metric, MetricType, MetricVec,
|
|
||||||
},
|
|
||||||
text::TextEncoder,
|
|
||||||
LabelGroup,
|
|
||||||
};
|
};
|
||||||
use twox_hash::xxh3;
|
use twox_hash::xxh3;
|
||||||
|
|
||||||
@@ -45,7 +40,7 @@ macro_rules! register_hll {
|
|||||||
}};
|
}};
|
||||||
|
|
||||||
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
|
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
|
||||||
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
|
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
|
||||||
}};
|
}};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -98,25 +93,203 @@ macro_rules! register_hll {
|
|||||||
/// ```
|
/// ```
|
||||||
///
|
///
|
||||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||||
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
|
#[derive(Clone)]
|
||||||
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
|
pub struct HyperLogLogVec<const N: usize> {
|
||||||
|
core: Arc<HyperLogLogVecCore<N>>,
|
||||||
pub struct HyperLogLogState<const N: usize> {
|
|
||||||
shards: [AtomicU8; N],
|
|
||||||
}
|
}
|
||||||
impl<const N: usize> Default for HyperLogLogState<N> {
|
|
||||||
fn default() -> Self {
|
struct HyperLogLogVecCore<const N: usize> {
|
||||||
#[allow(clippy::declare_interior_mutable_const)]
|
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
|
||||||
const ZERO: AtomicU8 = AtomicU8::new(0);
|
pub desc: core::Desc,
|
||||||
Self { shards: [ZERO; N] }
|
pub opts: Opts,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
|
||||||
|
fn desc(&self) -> Vec<&core::Desc> {
|
||||||
|
vec![&self.core.desc]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||||
|
let mut m = proto::MetricFamily::default();
|
||||||
|
m.set_name(self.core.desc.fq_name.clone());
|
||||||
|
m.set_help(self.core.desc.help.clone());
|
||||||
|
m.set_field_type(proto::MetricType::GAUGE);
|
||||||
|
|
||||||
|
let mut metrics = Vec::new();
|
||||||
|
for child in self.core.children.read().unwrap().values() {
|
||||||
|
child.core.collect_into(&mut metrics);
|
||||||
|
}
|
||||||
|
m.set_metric(metrics);
|
||||||
|
|
||||||
|
vec![m]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<const N: usize> MetricType for HyperLogLogState<N> {
|
impl<const N: usize> HyperLogLogVec<N> {
|
||||||
type Metadata = ();
|
/// Create a new [`HyperLogLogVec`] based on the provided
|
||||||
|
/// [`Opts`] and partitioned by the given label names. At least one label name must be
|
||||||
|
/// provided.
|
||||||
|
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
|
||||||
|
assert!(N.is_power_of_two());
|
||||||
|
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
|
||||||
|
let opts = opts.variable_labels(variable_names);
|
||||||
|
|
||||||
|
let desc = opts.describe()?;
|
||||||
|
let v = HyperLogLogVecCore {
|
||||||
|
children: RwLock::new(HashMap::default()),
|
||||||
|
desc,
|
||||||
|
opts,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Self { core: Arc::new(v) })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
|
||||||
|
/// of label values (same order as the VariableLabels in Desc). If that combination of
|
||||||
|
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
|
||||||
|
///
|
||||||
|
/// An error is returned if the number of label values is not the same as the
|
||||||
|
/// number of VariableLabels in Desc.
|
||||||
|
pub fn get_metric_with_label_values(
|
||||||
|
&self,
|
||||||
|
vals: &[&str],
|
||||||
|
) -> prometheus::Result<HyperLogLog<N>> {
|
||||||
|
self.core.get_metric_with_label_values(vals)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
|
||||||
|
/// occurs.
|
||||||
|
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
|
||||||
|
self.get_metric_with_label_values(vals).unwrap()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<const N: usize> HyperLogLogState<N> {
|
impl<const N: usize> HyperLogLogVecCore<N> {
|
||||||
|
pub fn get_metric_with_label_values(
|
||||||
|
&self,
|
||||||
|
vals: &[&str],
|
||||||
|
) -> prometheus::Result<HyperLogLog<N>> {
|
||||||
|
let h = self.hash_label_values(vals)?;
|
||||||
|
|
||||||
|
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
|
||||||
|
return Ok(metric);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.get_or_create_metric(h, vals)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
|
||||||
|
if vals.len() != self.desc.variable_labels.len() {
|
||||||
|
return Err(prometheus::Error::InconsistentCardinality {
|
||||||
|
expect: self.desc.variable_labels.len(),
|
||||||
|
got: vals.len(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut h = xxh3::Hash64::default();
|
||||||
|
for val in vals {
|
||||||
|
h.write(val.as_bytes());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(h.finish())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_or_create_metric(
|
||||||
|
&self,
|
||||||
|
hash: u64,
|
||||||
|
label_values: &[&str],
|
||||||
|
) -> prometheus::Result<HyperLogLog<N>> {
|
||||||
|
let mut children = self.children.write().unwrap();
|
||||||
|
// Check exist first.
|
||||||
|
if let Some(metric) = children.get(&hash).cloned() {
|
||||||
|
return Ok(metric);
|
||||||
|
}
|
||||||
|
|
||||||
|
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
|
||||||
|
children.insert(hash, metric.clone());
|
||||||
|
Ok(metric)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// HLL is a probabilistic cardinality measure.
|
||||||
|
///
|
||||||
|
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
||||||
|
///
|
||||||
|
/// ```promql
|
||||||
|
/// # harmonic mean
|
||||||
|
/// 1 / (
|
||||||
|
/// sum (
|
||||||
|
/// 2 ^ -(
|
||||||
|
/// # HLL merge operation
|
||||||
|
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
||||||
|
/// )
|
||||||
|
/// ) without (hll_shard)
|
||||||
|
/// )
|
||||||
|
/// * alpha
|
||||||
|
/// * shards_count
|
||||||
|
/// * shards_count
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// If you want an estimate over time, you can use the following query:
|
||||||
|
///
|
||||||
|
/// ```promql
|
||||||
|
/// # harmonic mean
|
||||||
|
/// 1 / (
|
||||||
|
/// sum (
|
||||||
|
/// 2 ^ -(
|
||||||
|
/// # HLL merge operation
|
||||||
|
/// max (
|
||||||
|
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
||||||
|
/// ) by (hll_shard, other_labels...)
|
||||||
|
/// )
|
||||||
|
/// ) without (hll_shard)
|
||||||
|
/// )
|
||||||
|
/// * alpha
|
||||||
|
/// * shards_count
|
||||||
|
/// * shards_count
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
||||||
|
///
|
||||||
|
/// ```promql
|
||||||
|
/// # LinearCounting(m, V) = m log (m / V)
|
||||||
|
/// shards_count * ln(shards_count /
|
||||||
|
/// # calculate V = how many shards contain a 0
|
||||||
|
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
||||||
|
/// )
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct HyperLogLog<const N: usize> {
|
||||||
|
core: Arc<HyperLogLogCore<N>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<const N: usize> HyperLogLog<N> {
|
||||||
|
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
|
||||||
|
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
|
||||||
|
assert!(N.is_power_of_two());
|
||||||
|
let opts = Opts::new(name, help);
|
||||||
|
Self::with_opts(opts)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a [`HyperLogLog`] with the `opts` options.
|
||||||
|
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
|
||||||
|
Self::with_opts_and_label_values(&opts, &[])
|
||||||
|
}
|
||||||
|
|
||||||
|
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
|
||||||
|
let desc = opts.describe()?;
|
||||||
|
let labels = make_label_pairs(&desc, label_values)?;
|
||||||
|
|
||||||
|
let v = HyperLogLogCore {
|
||||||
|
shards: [0; N].map(AtomicU8::new),
|
||||||
|
desc,
|
||||||
|
labels,
|
||||||
|
};
|
||||||
|
Ok(Self { core: Arc::new(v) })
|
||||||
|
}
|
||||||
|
|
||||||
pub fn measure(&self, item: &impl Hash) {
|
pub fn measure(&self, item: &impl Hash) {
|
||||||
// changing the hasher will break compatibility with previous measurements.
|
// changing the hasher will break compatibility with previous measurements.
|
||||||
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
||||||
@@ -126,11 +299,42 @@ impl<const N: usize> HyperLogLogState<N> {
|
|||||||
let p = N.ilog2() as u8;
|
let p = N.ilog2() as u8;
|
||||||
let j = hash & (N as u64 - 1);
|
let j = hash & (N as u64 - 1);
|
||||||
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
||||||
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct HyperLogLogCore<const N: usize> {
|
||||||
|
shards: [AtomicU8; N],
|
||||||
|
desc: core::Desc,
|
||||||
|
labels: Vec<proto::LabelPair>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<const N: usize> core::Collector for HyperLogLog<N> {
|
||||||
|
fn desc(&self) -> Vec<&core::Desc> {
|
||||||
|
vec![&self.core.desc]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn take_sample(&self) -> [u8; N] {
|
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||||
self.shards.each_ref().map(|x| {
|
let mut m = proto::MetricFamily::default();
|
||||||
|
m.set_name(self.core.desc.fq_name.clone());
|
||||||
|
m.set_help(self.core.desc.help.clone());
|
||||||
|
m.set_field_type(proto::MetricType::GAUGE);
|
||||||
|
|
||||||
|
let mut metrics = Vec::new();
|
||||||
|
self.core.collect_into(&mut metrics);
|
||||||
|
m.set_metric(metrics);
|
||||||
|
|
||||||
|
vec![m]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<const N: usize> HyperLogLogCore<N> {
|
||||||
|
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
|
||||||
|
self.shards.iter().enumerate().for_each(|(i, x)| {
|
||||||
|
let mut shard_label = proto::LabelPair::default();
|
||||||
|
shard_label.set_name("hll_shard".to_owned());
|
||||||
|
shard_label.set_value(format!("{i}"));
|
||||||
|
|
||||||
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
||||||
|
|
||||||
// This seems like it would be a race condition,
|
// This seems like it would be a race condition,
|
||||||
@@ -140,90 +344,85 @@ impl<const N: usize> HyperLogLogState<N> {
|
|||||||
|
|
||||||
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
||||||
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
||||||
x.swap(0, std::sync::atomic::Ordering::Relaxed)
|
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
|
||||||
|
|
||||||
|
let mut m = proto::Metric::default();
|
||||||
|
let mut c = proto::Gauge::default();
|
||||||
|
c.set_value(v as f64);
|
||||||
|
m.set_gauge(c);
|
||||||
|
|
||||||
|
let mut labels = Vec::with_capacity(self.labels.len() + 1);
|
||||||
|
labels.extend_from_slice(&self.labels);
|
||||||
|
labels.push(shard_label);
|
||||||
|
|
||||||
|
m.set_label(labels);
|
||||||
|
metrics.push(m);
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
|
||||||
for HyperLogLogState<N>
|
fn make_label_pairs(
|
||||||
{
|
desc: &core::Desc,
|
||||||
fn write_type(
|
label_values: &[&str],
|
||||||
name: impl MetricNameEncoder,
|
) -> prometheus::Result<Vec<proto::LabelPair>> {
|
||||||
enc: &mut TextEncoder<W>,
|
if desc.variable_labels.len() != label_values.len() {
|
||||||
) -> Result<(), std::io::Error> {
|
return Err(prometheus::Error::InconsistentCardinality {
|
||||||
enc.write_type(&name, measured::text::MetricType::Gauge)
|
expect: desc.variable_labels.len(),
|
||||||
|
got: label_values.len(),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
fn collect_into(
|
|
||||||
&self,
|
|
||||||
_: &(),
|
|
||||||
labels: impl LabelGroup,
|
|
||||||
name: impl MetricNameEncoder,
|
|
||||||
enc: &mut TextEncoder<W>,
|
|
||||||
) -> Result<(), std::io::Error> {
|
|
||||||
struct I64(i64);
|
|
||||||
impl LabelValue for I64 {
|
|
||||||
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
|
|
||||||
v.write_int(self.0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct HllShardLabel {
|
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
|
||||||
hll_shard: i64,
|
if total_len == 0 {
|
||||||
}
|
return Ok(vec![]);
|
||||||
|
|
||||||
impl LabelGroup for HllShardLabel {
|
|
||||||
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
|
|
||||||
const LE: &LabelName = LabelName::from_str("hll_shard");
|
|
||||||
v.write_value(LE, &I64(self.hll_shard));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
self.take_sample()
|
|
||||||
.into_iter()
|
|
||||||
.enumerate()
|
|
||||||
.try_for_each(|(hll_shard, val)| {
|
|
||||||
enc.write_metric_value(
|
|
||||||
name.by_ref(),
|
|
||||||
labels.by_ref().compose_with(HllShardLabel {
|
|
||||||
hll_shard: hll_shard as i64,
|
|
||||||
}),
|
|
||||||
MetricValue::Int(val as i64),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if desc.variable_labels.is_empty() {
|
||||||
|
return Ok(desc.const_label_pairs.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut label_pairs = Vec::with_capacity(total_len);
|
||||||
|
for (i, n) in desc.variable_labels.iter().enumerate() {
|
||||||
|
let mut label_pair = proto::LabelPair::default();
|
||||||
|
label_pair.set_name(n.clone());
|
||||||
|
label_pair.set_value(label_values[i].to_owned());
|
||||||
|
label_pairs.push(label_pair);
|
||||||
|
}
|
||||||
|
|
||||||
|
for label_pair in &desc.const_label_pairs {
|
||||||
|
label_pairs.push(label_pair.clone());
|
||||||
|
}
|
||||||
|
label_pairs.sort();
|
||||||
|
Ok(label_pairs)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
|
||||||
use measured::{label::StaticLabelSet, FixedCardinalityLabel};
|
use prometheus::{proto, Opts};
|
||||||
use rand::{rngs::StdRng, Rng, SeedableRng};
|
use rand::{rngs::StdRng, Rng, SeedableRng};
|
||||||
use rand_distr::{Distribution, Zipf};
|
use rand_distr::{Distribution, Zipf};
|
||||||
|
|
||||||
use crate::HyperLogLogVec;
|
use crate::HyperLogLogVec;
|
||||||
|
|
||||||
#[derive(FixedCardinalityLabel, Clone, Copy)]
|
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
|
||||||
#[label(singleton = "x")]
|
let mut metrics = vec![];
|
||||||
enum Label {
|
hll.core
|
||||||
A,
|
.children
|
||||||
B,
|
.read()
|
||||||
|
.unwrap()
|
||||||
|
.values()
|
||||||
|
.for_each(|c| c.core.collect_into(&mut metrics));
|
||||||
|
metrics
|
||||||
}
|
}
|
||||||
|
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
|
||||||
fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
|
|
||||||
// cannot go through the `hll.collect_family_into` interface yet...
|
|
||||||
// need to see if I can fix the conflicting impls problem in measured.
|
|
||||||
(
|
|
||||||
hll.get_metric(hll.with_labels(Label::A)).take_sample(),
|
|
||||||
hll.get_metric(hll.with_labels(Label::B)).take_sample(),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
|
|
||||||
let mut buckets = [0.0; 32];
|
let mut buckets = [0.0; 32];
|
||||||
for &sample in samples {
|
for metric in metrics.chunks_exact(32) {
|
||||||
for (i, m) in sample.into_iter().enumerate() {
|
if filter(&metric[0]) {
|
||||||
buckets[i] = f64::max(buckets[i], m as f64);
|
for (i, m) in metric.iter().enumerate() {
|
||||||
|
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -238,7 +437,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
|
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
|
||||||
let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();
|
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
|
||||||
|
|
||||||
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
|
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
|
||||||
let mut set_a = HashSet::new();
|
let mut set_a = HashSet::new();
|
||||||
@@ -246,20 +445,18 @@ mod tests {
|
|||||||
|
|
||||||
for x in iter.by_ref().take(n) {
|
for x in iter.by_ref().take(n) {
|
||||||
set_a.insert(x.to_bits());
|
set_a.insert(x.to_bits());
|
||||||
hll.get_metric(hll.with_labels(Label::A))
|
hll.with_label_values(&["a"]).measure(&x.to_bits());
|
||||||
.measure(&x.to_bits());
|
|
||||||
}
|
}
|
||||||
for x in iter.by_ref().take(n) {
|
for x in iter.by_ref().take(n) {
|
||||||
set_b.insert(x.to_bits());
|
set_b.insert(x.to_bits());
|
||||||
hll.get_metric(hll.with_labels(Label::B))
|
hll.with_label_values(&["b"]).measure(&x.to_bits());
|
||||||
.measure(&x.to_bits());
|
|
||||||
}
|
}
|
||||||
let merge = &set_a | &set_b;
|
let merge = &set_a | &set_b;
|
||||||
|
|
||||||
let (a, b) = collect(&hll);
|
let metrics = collect(&hll);
|
||||||
let len = get_cardinality(&[a, b]);
|
let len = get_cardinality(&metrics, |_| true);
|
||||||
let len_a = get_cardinality(&[a]);
|
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
|
||||||
let len_b = get_cardinality(&[b]);
|
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
|
||||||
|
|
||||||
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
|
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,17 +4,6 @@
|
|||||||
//! a default registry.
|
//! a default registry.
|
||||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
use measured::{
|
|
||||||
label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
|
|
||||||
metric::{
|
|
||||||
counter::CounterState,
|
|
||||||
gauge::GaugeState,
|
|
||||||
group::{Encoding, MetricValue},
|
|
||||||
name::{MetricName, MetricNameEncoder},
|
|
||||||
MetricEncoding, MetricFamilyEncoding,
|
|
||||||
},
|
|
||||||
FixedCardinalityLabel, LabelGroup, MetricGroup,
|
|
||||||
};
|
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use prometheus::core::{
|
use prometheus::core::{
|
||||||
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
|
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
|
||||||
@@ -22,7 +11,6 @@ use prometheus::core::{
|
|||||||
pub use prometheus::opts;
|
pub use prometheus::opts;
|
||||||
pub use prometheus::register;
|
pub use prometheus::register;
|
||||||
pub use prometheus::Error;
|
pub use prometheus::Error;
|
||||||
use prometheus::Registry;
|
|
||||||
pub use prometheus::{core, default_registry, proto};
|
pub use prometheus::{core, default_registry, proto};
|
||||||
pub use prometheus::{exponential_buckets, linear_buckets};
|
pub use prometheus::{exponential_buckets, linear_buckets};
|
||||||
pub use prometheus::{register_counter_vec, Counter, CounterVec};
|
pub use prometheus::{register_counter_vec, Counter, CounterVec};
|
||||||
@@ -35,12 +23,13 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
|
|||||||
pub use prometheus::{register_int_gauge, IntGauge};
|
pub use prometheus::{register_int_gauge, IntGauge};
|
||||||
pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
|
pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
|
||||||
pub use prometheus::{Encoder, TextEncoder};
|
pub use prometheus::{Encoder, TextEncoder};
|
||||||
|
use prometheus::{Registry, Result};
|
||||||
|
|
||||||
pub mod launch_timestamp;
|
pub mod launch_timestamp;
|
||||||
mod wrappers;
|
mod wrappers;
|
||||||
pub use wrappers::{CountedReader, CountedWriter};
|
pub use wrappers::{CountedReader, CountedWriter};
|
||||||
mod hll;
|
mod hll;
|
||||||
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
|
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
pub mod more_process_metrics;
|
pub mod more_process_metrics;
|
||||||
|
|
||||||
@@ -70,7 +59,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
|
|||||||
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
|
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
|
||||||
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
|
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
|
||||||
/// while holding the lock.
|
/// while holding the lock.
|
||||||
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
|
pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
|
||||||
INTERNAL_REGISTRY.register(c)
|
INTERNAL_REGISTRY.register(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,127 +96,6 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
|||||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||||
];
|
];
|
||||||
|
|
||||||
pub struct BuildInfo {
|
|
||||||
pub revision: &'static str,
|
|
||||||
pub build_tag: &'static str,
|
|
||||||
}
|
|
||||||
|
|
||||||
// todo: allow label group without the set
|
|
||||||
impl LabelGroup for BuildInfo {
|
|
||||||
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
|
|
||||||
const REVISION: &LabelName = LabelName::from_str("revision");
|
|
||||||
v.write_value(REVISION, &self.revision);
|
|
||||||
const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
|
|
||||||
v.write_value(BUILD_TAG, &self.build_tag);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
|
|
||||||
where
|
|
||||||
GaugeState: MetricEncoding<T>,
|
|
||||||
{
|
|
||||||
fn collect_family_into(
|
|
||||||
&self,
|
|
||||||
name: impl measured::metric::name::MetricNameEncoder,
|
|
||||||
enc: &mut T,
|
|
||||||
) -> Result<(), T::Err> {
|
|
||||||
enc.write_help(&name, "Build/version information")?;
|
|
||||||
GaugeState::write_type(&name, enc)?;
|
|
||||||
GaugeState {
|
|
||||||
count: std::sync::atomic::AtomicI64::new(1),
|
|
||||||
}
|
|
||||||
.collect_into(&(), self, name, enc)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(MetricGroup)]
|
|
||||||
#[metric(new(build_info: BuildInfo))]
|
|
||||||
pub struct NeonMetrics {
|
|
||||||
#[cfg(target_os = "linux")]
|
|
||||||
#[metric(namespace = "process")]
|
|
||||||
#[metric(init = measured_process::ProcessCollector::for_self())]
|
|
||||||
process: measured_process::ProcessCollector,
|
|
||||||
|
|
||||||
#[metric(namespace = "libmetrics")]
|
|
||||||
#[metric(init = LibMetrics::new(build_info))]
|
|
||||||
libmetrics: LibMetrics,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(MetricGroup)]
|
|
||||||
#[metric(new(build_info: BuildInfo))]
|
|
||||||
pub struct LibMetrics {
|
|
||||||
#[metric(init = build_info)]
|
|
||||||
build_info: BuildInfo,
|
|
||||||
|
|
||||||
#[metric(flatten)]
|
|
||||||
rusage: Rusage,
|
|
||||||
|
|
||||||
serve_count: CollectionCounter,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_gauge<Enc: Encoding>(
|
|
||||||
x: i64,
|
|
||||||
labels: impl LabelGroup,
|
|
||||||
name: impl MetricNameEncoder,
|
|
||||||
enc: &mut Enc,
|
|
||||||
) -> Result<(), Enc::Err> {
|
|
||||||
enc.write_metric_value(name, labels, MetricValue::Int(x))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
struct Rusage;
|
|
||||||
|
|
||||||
#[derive(FixedCardinalityLabel, Clone, Copy)]
|
|
||||||
#[label(singleton = "io_operation")]
|
|
||||||
enum IoOp {
|
|
||||||
Read,
|
|
||||||
Write,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: Encoding> MetricGroup<T> for Rusage
|
|
||||||
where
|
|
||||||
GaugeState: MetricEncoding<T>,
|
|
||||||
{
|
|
||||||
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
|
|
||||||
const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
|
|
||||||
const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
|
|
||||||
|
|
||||||
let ru = get_rusage_stats();
|
|
||||||
|
|
||||||
enc.write_help(
|
|
||||||
DISK_IO,
|
|
||||||
"Bytes written and read from disk, grouped by the operation (read|write)",
|
|
||||||
)?;
|
|
||||||
GaugeState::write_type(DISK_IO, enc)?;
|
|
||||||
write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
|
|
||||||
write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
|
|
||||||
|
|
||||||
enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
|
|
||||||
GaugeState::write_type(MAXRSS, enc)?;
|
|
||||||
write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
struct CollectionCounter(CounterState);
|
|
||||||
|
|
||||||
impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
|
|
||||||
where
|
|
||||||
CounterState: MetricEncoding<T>,
|
|
||||||
{
|
|
||||||
fn collect_family_into(
|
|
||||||
&self,
|
|
||||||
name: impl measured::metric::name::MetricNameEncoder,
|
|
||||||
enc: &mut T,
|
|
||||||
) -> Result<(), T::Err> {
|
|
||||||
self.0.inc();
|
|
||||||
enc.write_help(&name, "Number of metric requests made")?;
|
|
||||||
self.0.collect_into(&(), NoLabels, name, enc)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn set_build_info_metric(revision: &str, build_tag: &str) {
|
pub fn set_build_info_metric(revision: &str, build_tag: &str) {
|
||||||
let metric = register_int_gauge_vec!(
|
let metric = register_int_gauge_vec!(
|
||||||
"libmetrics_build_info",
|
"libmetrics_build_info",
|
||||||
@@ -237,7 +105,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
|
|||||||
.expect("Failed to register build info metric");
|
.expect("Failed to register build info metric");
|
||||||
metric.with_label_values(&[revision, build_tag]).set(1);
|
metric.with_label_values(&[revision, build_tag]).set(1);
|
||||||
}
|
}
|
||||||
const BYTES_IN_BLOCK: i64 = 512;
|
|
||||||
|
|
||||||
// Records I/O stats in a "cross-platform" way.
|
// Records I/O stats in a "cross-platform" way.
|
||||||
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
|
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
|
||||||
@@ -250,6 +117,7 @@ const BYTES_IN_BLOCK: i64 = 512;
|
|||||||
fn update_rusage_metrics() {
|
fn update_rusage_metrics() {
|
||||||
let rusage_stats = get_rusage_stats();
|
let rusage_stats = get_rusage_stats();
|
||||||
|
|
||||||
|
const BYTES_IN_BLOCK: i64 = 512;
|
||||||
DISK_IO_BYTES
|
DISK_IO_BYTES
|
||||||
.with_label_values(&["read"])
|
.with_label_values(&["read"])
|
||||||
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
|
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
|
||||||
@@ -283,7 +151,6 @@ macro_rules! register_int_counter_pair_vec {
|
|||||||
}
|
}
|
||||||
}};
|
}};
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create an [`IntCounterPair`] and registers to default registry.
|
/// Create an [`IntCounterPair`] and registers to default registry.
|
||||||
#[macro_export(local_inner_macros)]
|
#[macro_export(local_inner_macros)]
|
||||||
macro_rules! register_int_counter_pair {
|
macro_rules! register_int_counter_pair {
|
||||||
@@ -321,10 +188,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
|
|||||||
///
|
///
|
||||||
/// An error is returned if the number of label values is not the same as the
|
/// An error is returned if the number of label values is not the same as the
|
||||||
/// number of VariableLabels in Desc.
|
/// number of VariableLabels in Desc.
|
||||||
pub fn get_metric_with_label_values(
|
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
|
||||||
&self,
|
|
||||||
vals: &[&str],
|
|
||||||
) -> prometheus::Result<GenericCounterPair<P>> {
|
|
||||||
Ok(GenericCounterPair {
|
Ok(GenericCounterPair {
|
||||||
inc: self.inc.get_metric_with_label_values(vals)?,
|
inc: self.inc.get_metric_with_label_values(vals)?,
|
||||||
dec: self.dec.get_metric_with_label_values(vals)?,
|
dec: self.dec.get_metric_with_label_values(vals)?,
|
||||||
@@ -337,7 +201,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
|
|||||||
self.get_metric_with_label_values(vals).unwrap()
|
self.get_metric_with_label_values(vals).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
|
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
|
||||||
res[0] = self.inc.remove_label_values(vals);
|
res[0] = self.inc.remove_label_values(vals);
|
||||||
res[1] = self.dec.remove_label_values(vals);
|
res[1] = self.dec.remove_label_values(vals);
|
||||||
}
|
}
|
||||||
@@ -421,171 +285,3 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;
|
|||||||
|
|
||||||
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
|
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
|
||||||
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
|
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
|
||||||
|
|
||||||
pub trait CounterPairAssoc {
|
|
||||||
const INC_NAME: &'static MetricName;
|
|
||||||
const DEC_NAME: &'static MetricName;
|
|
||||||
|
|
||||||
const INC_HELP: &'static str;
|
|
||||||
const DEC_HELP: &'static str;
|
|
||||||
|
|
||||||
type LabelGroupSet: LabelGroupSet;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct CounterPairVec<A: CounterPairAssoc> {
|
|
||||||
vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<A: CounterPairAssoc> Default for CounterPairVec<A>
|
|
||||||
where
|
|
||||||
A::LabelGroupSet: Default,
|
|
||||||
{
|
|
||||||
fn default() -> Self {
|
|
||||||
Self {
|
|
||||||
vec: Default::default(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<A: CounterPairAssoc> CounterPairVec<A> {
|
|
||||||
pub fn guard(
|
|
||||||
&self,
|
|
||||||
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
|
|
||||||
) -> MeasuredCounterPairGuard<'_, A> {
|
|
||||||
let id = self.vec.with_labels(labels);
|
|
||||||
self.vec.get_metric(id).inc.inc();
|
|
||||||
MeasuredCounterPairGuard { vec: &self.vec, id }
|
|
||||||
}
|
|
||||||
pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
|
|
||||||
let id = self.vec.with_labels(labels);
|
|
||||||
self.vec.get_metric(id).inc.inc();
|
|
||||||
}
|
|
||||||
pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
|
|
||||||
let id = self.vec.with_labels(labels);
|
|
||||||
self.vec.get_metric(id).dec.inc();
|
|
||||||
}
|
|
||||||
pub fn remove_metric(
|
|
||||||
&self,
|
|
||||||
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
|
|
||||||
) -> Option<MeasuredCounterPairState> {
|
|
||||||
let id = self.vec.with_labels(labels);
|
|
||||||
self.vec.remove_metric(id)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
|
|
||||||
where
|
|
||||||
T: ::measured::metric::group::Encoding,
|
|
||||||
A: CounterPairAssoc,
|
|
||||||
::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
|
|
||||||
{
|
|
||||||
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
|
|
||||||
// write decrement first to avoid a race condition where inc - dec < 0
|
|
||||||
T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
|
|
||||||
self.vec
|
|
||||||
.collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
|
|
||||||
|
|
||||||
T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
|
|
||||||
self.vec
|
|
||||||
.collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(MetricGroup, Default)]
|
|
||||||
pub struct MeasuredCounterPairState {
|
|
||||||
pub inc: CounterState,
|
|
||||||
pub dec: CounterState,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl measured::metric::MetricType for MeasuredCounterPairState {
|
|
||||||
type Metadata = ();
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
|
|
||||||
vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
|
|
||||||
id: measured::metric::LabelId<A::LabelGroupSet>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
self.vec.get_metric(self.id).dec.inc();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
|
|
||||||
struct Inc<T>(T);
|
|
||||||
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
|
|
||||||
struct Dec<T>(T);
|
|
||||||
|
|
||||||
impl<T: Encoding> Encoding for Inc<T> {
|
|
||||||
type Err = T::Err;
|
|
||||||
|
|
||||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
|
||||||
self.0.write_help(name, help)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_metric_value(
|
|
||||||
&mut self,
|
|
||||||
name: impl MetricNameEncoder,
|
|
||||||
labels: impl LabelGroup,
|
|
||||||
value: MetricValue,
|
|
||||||
) -> Result<(), Self::Err> {
|
|
||||||
self.0.write_metric_value(name, labels, value)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
|
|
||||||
where
|
|
||||||
CounterState: MetricEncoding<T>,
|
|
||||||
{
|
|
||||||
fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
|
|
||||||
CounterState::write_type(name, &mut enc.0)
|
|
||||||
}
|
|
||||||
fn collect_into(
|
|
||||||
&self,
|
|
||||||
metadata: &(),
|
|
||||||
labels: impl LabelGroup,
|
|
||||||
name: impl MetricNameEncoder,
|
|
||||||
enc: &mut Inc<T>,
|
|
||||||
) -> Result<(), T::Err> {
|
|
||||||
self.inc.collect_into(metadata, labels, name, &mut enc.0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: Encoding> Encoding for Dec<T> {
|
|
||||||
type Err = T::Err;
|
|
||||||
|
|
||||||
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
|
|
||||||
self.0.write_help(name, help)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_metric_value(
|
|
||||||
&mut self,
|
|
||||||
name: impl MetricNameEncoder,
|
|
||||||
labels: impl LabelGroup,
|
|
||||||
value: MetricValue,
|
|
||||||
) -> Result<(), Self::Err> {
|
|
||||||
self.0.write_metric_value(name, labels, value)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Write the dec counter to the encoder
|
|
||||||
impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
|
|
||||||
where
|
|
||||||
CounterState: MetricEncoding<T>,
|
|
||||||
{
|
|
||||||
fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
|
|
||||||
CounterState::write_type(name, &mut enc.0)
|
|
||||||
}
|
|
||||||
fn collect_into(
|
|
||||||
&self,
|
|
||||||
metadata: &(),
|
|
||||||
labels: impl LabelGroup,
|
|
||||||
name: impl MetricNameEncoder,
|
|
||||||
enc: &mut Dec<T>,
|
|
||||||
) -> Result<(), T::Err> {
|
|
||||||
self.dec.collect_into(metadata, labels, name, &mut enc.0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -2,14 +2,11 @@ use std::str::FromStr;
|
|||||||
|
|
||||||
/// Request/response types for the storage controller
|
/// Request/response types for the storage controller
|
||||||
/// API (`/control/v1` prefix). Implemented by the server
|
/// API (`/control/v1` prefix). Implemented by the server
|
||||||
/// in [`storage_controller::http`]
|
/// in [`attachment_service::http`]
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::{NodeId, TenantId};
|
use utils::id::NodeId;
|
||||||
|
|
||||||
use crate::{
|
use crate::{models::ShardParameters, shard::TenantShardId};
|
||||||
models::{ShardParameters, TenantConfig},
|
|
||||||
shard::{ShardStripeSize, TenantShardId},
|
|
||||||
};
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct TenantCreateResponseShard {
|
pub struct TenantCreateResponseShard {
|
||||||
@@ -38,16 +35,10 @@ pub struct NodeRegisterRequest {
|
|||||||
pub struct NodeConfigureRequest {
|
pub struct NodeConfigureRequest {
|
||||||
pub node_id: NodeId,
|
pub node_id: NodeId,
|
||||||
|
|
||||||
pub availability: Option<NodeAvailabilityWrapper>,
|
pub availability: Option<NodeAvailability>,
|
||||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
pub struct TenantPolicyRequest {
|
|
||||||
pub placement: Option<PlacementPolicy>,
|
|
||||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
pub struct TenantLocateResponseShard {
|
pub struct TenantLocateResponseShard {
|
||||||
pub shard_id: TenantShardId,
|
pub shard_id: TenantShardId,
|
||||||
@@ -66,48 +57,6 @@ pub struct TenantLocateResponse {
|
|||||||
pub shard_params: ShardParameters,
|
pub shard_params: ShardParameters,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
pub struct TenantDescribeResponse {
|
|
||||||
pub tenant_id: TenantId,
|
|
||||||
pub shards: Vec<TenantDescribeResponseShard>,
|
|
||||||
pub stripe_size: ShardStripeSize,
|
|
||||||
pub policy: PlacementPolicy,
|
|
||||||
pub config: TenantConfig,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
pub struct NodeDescribeResponse {
|
|
||||||
pub id: NodeId,
|
|
||||||
|
|
||||||
pub availability: NodeAvailabilityWrapper,
|
|
||||||
pub scheduling: NodeSchedulingPolicy,
|
|
||||||
|
|
||||||
pub listen_http_addr: String,
|
|
||||||
pub listen_http_port: u16,
|
|
||||||
|
|
||||||
pub listen_pg_addr: String,
|
|
||||||
pub listen_pg_port: u16,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
pub struct TenantDescribeResponseShard {
|
|
||||||
pub tenant_shard_id: TenantShardId,
|
|
||||||
|
|
||||||
pub node_attached: Option<NodeId>,
|
|
||||||
pub node_secondary: Vec<NodeId>,
|
|
||||||
|
|
||||||
pub last_error: String,
|
|
||||||
|
|
||||||
/// A task is currently running to reconcile this tenant's intent state with the state on pageservers
|
|
||||||
pub is_reconciling: bool,
|
|
||||||
/// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
|
|
||||||
pub is_pending_compute_notification: bool,
|
|
||||||
/// A shard split is currently underway
|
|
||||||
pub is_splitting: bool,
|
|
||||||
|
|
||||||
pub scheduling_policy: ShardSchedulingPolicy,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Explicitly migrating a particular shard is a low level operation
|
/// Explicitly migrating a particular shard is a low level operation
|
||||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||||
@@ -117,94 +66,29 @@ pub struct TenantShardMigrateRequest {
|
|||||||
pub node_id: NodeId,
|
pub node_id: NodeId,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Utilisation score indicating how good a candidate a pageserver
|
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
|
||||||
/// Lower values are better.
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
|
|
||||||
pub struct UtilizationScore(pub u64);
|
|
||||||
|
|
||||||
impl UtilizationScore {
|
|
||||||
pub fn worst() -> Self {
|
|
||||||
UtilizationScore(u64::MAX)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
|
|
||||||
#[serde(into = "NodeAvailabilityWrapper")]
|
|
||||||
pub enum NodeAvailability {
|
pub enum NodeAvailability {
|
||||||
// Normal, happy state
|
// Normal, happy state
|
||||||
Active(UtilizationScore),
|
Active,
|
||||||
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
||||||
// secondary locations on this node still exist. Newly added nodes are in this
|
// secondary locations on this node still exist. Newly added nodes are in this
|
||||||
// state until we successfully contact them.
|
// state until we successfully contact them.
|
||||||
Offline,
|
Offline,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PartialEq for NodeAvailability {
|
impl FromStr for NodeAvailability {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
type Err = anyhow::Error;
|
||||||
use NodeAvailability::*;
|
|
||||||
matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Eq for NodeAvailability {}
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||||
|
match s {
|
||||||
// This wrapper provides serde functionality and it should only be used to
|
"active" => Ok(Self::Active),
|
||||||
// communicate with external callers which don't know or care about the
|
"offline" => Ok(Self::Offline),
|
||||||
// utilisation score of the pageserver it is targeting.
|
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||||
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
|
|
||||||
pub enum NodeAvailabilityWrapper {
|
|
||||||
Active,
|
|
||||||
Offline,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<NodeAvailabilityWrapper> for NodeAvailability {
|
|
||||||
fn from(val: NodeAvailabilityWrapper) -> Self {
|
|
||||||
match val {
|
|
||||||
// Assume the worst utilisation score to begin with. It will later be updated by
|
|
||||||
// the heartbeats.
|
|
||||||
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
|
|
||||||
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||||
fn from(val: NodeAvailability) -> Self {
|
|
||||||
match val {
|
|
||||||
NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
|
|
||||||
NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
|
||||||
pub enum ShardSchedulingPolicy {
|
|
||||||
// Normal mode: the tenant's scheduled locations may be updated at will, including
|
|
||||||
// for non-essential optimization.
|
|
||||||
Active,
|
|
||||||
|
|
||||||
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
|
|
||||||
// For example, this still permits a node's attachment location to change to a secondary in
|
|
||||||
// response to a node failure, or to assign a new secondary if a node was removed.
|
|
||||||
Essential,
|
|
||||||
|
|
||||||
// No scheduling: leave the shard running wherever it currently is. Even if the shard is
|
|
||||||
// unavailable, it will not be rescheduled to another node.
|
|
||||||
Pause,
|
|
||||||
|
|
||||||
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
|
|
||||||
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
|
|
||||||
Stop,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for ShardSchedulingPolicy {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self::Active
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
|
||||||
pub enum NodeSchedulingPolicy {
|
pub enum NodeSchedulingPolicy {
|
||||||
Active,
|
Active,
|
||||||
Filling,
|
Filling,
|
||||||
@@ -243,8 +127,11 @@ impl From<NodeSchedulingPolicy> for String {
|
|||||||
/// to create secondary locations.
|
/// to create secondary locations.
|
||||||
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
|
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||||
pub enum PlacementPolicy {
|
pub enum PlacementPolicy {
|
||||||
/// Normal live state: one attached pageserver and zero or more secondaries.
|
/// Cheapest way to attach a tenant: just one pageserver, no secondary
|
||||||
Attached(usize),
|
Single,
|
||||||
|
/// Production-ready way to attach a tenant: one attached pageserver and
|
||||||
|
/// some number of secondaries.
|
||||||
|
Double(usize),
|
||||||
/// Create one secondary mode locations. This is useful when onboarding
|
/// Create one secondary mode locations. This is useful when onboarding
|
||||||
/// a tenant, or for an idle tenant that we might want to bring online quickly.
|
/// a tenant, or for an idle tenant that we might want to bring online quickly.
|
||||||
Secondary,
|
Secondary,
|
||||||
@@ -266,14 +153,14 @@ mod test {
|
|||||||
/// Check stability of PlacementPolicy's serialization
|
/// Check stability of PlacementPolicy's serialization
|
||||||
#[test]
|
#[test]
|
||||||
fn placement_policy_encoding() -> anyhow::Result<()> {
|
fn placement_policy_encoding() -> anyhow::Result<()> {
|
||||||
let v = PlacementPolicy::Attached(1);
|
let v = PlacementPolicy::Double(1);
|
||||||
let encoded = serde_json::to_string(&v)?;
|
let encoded = serde_json::to_string(&v)?;
|
||||||
assert_eq!(encoded, "{\"Attached\":1}");
|
assert_eq!(encoded, "{\"Double\":1}");
|
||||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||||
|
|
||||||
let v = PlacementPolicy::Detached;
|
let v = PlacementPolicy::Single;
|
||||||
let encoded = serde_json::to_string(&v)?;
|
let encoded = serde_json::to_string(&v)?;
|
||||||
assert_eq!(encoded, "\"Detached\"");
|
assert_eq!(encoded, "\"Single\"");
|
||||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ pub mod utilization;
|
|||||||
pub use utilization::PageserverUtilization;
|
pub use utilization::PageserverUtilization;
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
borrow::Cow,
|
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
io::{BufRead, Read},
|
io::{BufRead, Read},
|
||||||
num::{NonZeroU64, NonZeroUsize},
|
num::{NonZeroU64, NonZeroUsize},
|
||||||
@@ -20,7 +19,6 @@ use utils::{
|
|||||||
history_buffer::HistoryBufferWithDropCounter,
|
history_buffer::HistoryBufferWithDropCounter,
|
||||||
id::{NodeId, TenantId, TimelineId},
|
id::{NodeId, TenantId, TimelineId},
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
serde_system_time,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::controller_api::PlacementPolicy;
|
use crate::controller_api::PlacementPolicy;
|
||||||
@@ -296,13 +294,13 @@ pub struct TenantConfig {
|
|||||||
pub lagging_wal_timeout: Option<String>,
|
pub lagging_wal_timeout: Option<String>,
|
||||||
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
pub max_lsn_wal_lag: Option<NonZeroU64>,
|
||||||
pub trace_read_requests: Option<bool>,
|
pub trace_read_requests: Option<bool>,
|
||||||
|
pub image_layer_compression: Option<CompressionAlgorithm>,
|
||||||
pub eviction_policy: Option<EvictionPolicy>,
|
pub eviction_policy: Option<EvictionPolicy>,
|
||||||
pub min_resident_size_override: Option<u64>,
|
pub min_resident_size_override: Option<u64>,
|
||||||
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
||||||
pub heatmap_period: Option<String>,
|
pub heatmap_period: Option<String>,
|
||||||
pub lazy_slru_download: Option<bool>,
|
pub lazy_slru_download: Option<bool>,
|
||||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||||
pub image_layer_creation_check_threshold: Option<u8>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
@@ -330,6 +328,23 @@ pub enum CompactionAlgorithm {
|
|||||||
Tiered,
|
Tiered,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(
|
||||||
|
Debug,
|
||||||
|
Clone,
|
||||||
|
Copy,
|
||||||
|
PartialEq,
|
||||||
|
Eq,
|
||||||
|
Serialize,
|
||||||
|
Deserialize,
|
||||||
|
strum_macros::FromRepr,
|
||||||
|
enum_map::Enum,
|
||||||
|
)]
|
||||||
|
#[repr(u8)]
|
||||||
|
pub enum CompressionAlgorithm {
|
||||||
|
NoCompression,
|
||||||
|
LZ4,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
pub struct EvictionPolicyLayerAccessThreshold {
|
pub struct EvictionPolicyLayerAccessThreshold {
|
||||||
#[serde(with = "humantime_serde")]
|
#[serde(with = "humantime_serde")]
|
||||||
@@ -580,7 +595,7 @@ pub struct TimelineInfo {
|
|||||||
pub walreceiver_status: String,
|
pub walreceiver_status: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
pub struct LayerMapInfo {
|
pub struct LayerMapInfo {
|
||||||
pub in_memory_layers: Vec<InMemoryLayerInfo>,
|
pub in_memory_layers: Vec<InMemoryLayerInfo>,
|
||||||
pub historic_layers: Vec<HistoricLayerInfo>,
|
pub historic_layers: Vec<HistoricLayerInfo>,
|
||||||
@@ -598,7 +613,7 @@ pub enum LayerAccessKind {
|
|||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct LayerAccessStatFullDetails {
|
pub struct LayerAccessStatFullDetails {
|
||||||
pub when_millis_since_epoch: u64,
|
pub when_millis_since_epoch: u64,
|
||||||
pub task_kind: Cow<'static, str>,
|
pub task_kind: &'static str,
|
||||||
pub access_kind: LayerAccessKind,
|
pub access_kind: LayerAccessKind,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -657,23 +672,23 @@ impl LayerResidenceEvent {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
pub struct LayerAccessStats {
|
pub struct LayerAccessStats {
|
||||||
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
|
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
|
||||||
pub task_kind_access_flag: Vec<Cow<'static, str>>,
|
pub task_kind_access_flag: Vec<&'static str>,
|
||||||
pub first: Option<LayerAccessStatFullDetails>,
|
pub first: Option<LayerAccessStatFullDetails>,
|
||||||
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
|
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
|
||||||
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
#[serde(tag = "kind")]
|
#[serde(tag = "kind")]
|
||||||
pub enum InMemoryLayerInfo {
|
pub enum InMemoryLayerInfo {
|
||||||
Open { lsn_start: Lsn },
|
Open { lsn_start: Lsn },
|
||||||
Frozen { lsn_start: Lsn, lsn_end: Lsn },
|
Frozen { lsn_start: Lsn, lsn_end: Lsn },
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize)]
|
||||||
#[serde(tag = "kind")]
|
#[serde(tag = "kind")]
|
||||||
pub enum HistoricLayerInfo {
|
pub enum HistoricLayerInfo {
|
||||||
Delta {
|
Delta {
|
||||||
@@ -695,32 +710,6 @@ pub enum HistoricLayerInfo {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
impl HistoricLayerInfo {
|
|
||||||
pub fn layer_file_name(&self) -> &str {
|
|
||||||
match self {
|
|
||||||
HistoricLayerInfo::Delta {
|
|
||||||
layer_file_name, ..
|
|
||||||
} => layer_file_name,
|
|
||||||
HistoricLayerInfo::Image {
|
|
||||||
layer_file_name, ..
|
|
||||||
} => layer_file_name,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pub fn is_remote(&self) -> bool {
|
|
||||||
match self {
|
|
||||||
HistoricLayerInfo::Delta { remote, .. } => *remote,
|
|
||||||
HistoricLayerInfo::Image { remote, .. } => *remote,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pub fn set_remote(&mut self, value: bool) {
|
|
||||||
let field = match self {
|
|
||||||
HistoricLayerInfo::Delta { remote, .. } => remote,
|
|
||||||
HistoricLayerInfo::Image { remote, .. } => remote,
|
|
||||||
};
|
|
||||||
*field = value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
pub struct DownloadRemoteLayersTaskSpawnRequest {
|
pub struct DownloadRemoteLayersTaskSpawnRequest {
|
||||||
pub max_concurrent_downloads: NonZeroUsize,
|
pub max_concurrent_downloads: NonZeroUsize,
|
||||||
@@ -747,48 +736,10 @@ pub struct TimelineGcRequest {
|
|||||||
pub gc_horizon: Option<u64>,
|
pub gc_horizon: Option<u64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
||||||
pub struct WalRedoManagerProcessStatus {
|
|
||||||
pub pid: u32,
|
|
||||||
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
|
|
||||||
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
|
|
||||||
pub kind: Cow<'static, str>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct WalRedoManagerStatus {
|
pub struct WalRedoManagerStatus {
|
||||||
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
|
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||||
pub process: Option<WalRedoManagerProcessStatus>,
|
pub pid: Option<u32>,
|
||||||
}
|
|
||||||
|
|
||||||
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
|
||||||
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
|
|
||||||
/// what's happening.
|
|
||||||
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
|
||||||
pub struct SecondaryProgress {
|
|
||||||
/// The remote storage LastModified time of the heatmap object we last downloaded.
|
|
||||||
pub heatmap_mtime: Option<serde_system_time::SystemTime>,
|
|
||||||
|
|
||||||
/// The number of layers currently on-disk
|
|
||||||
pub layers_downloaded: usize,
|
|
||||||
/// The number of layers in the most recently seen heatmap
|
|
||||||
pub layers_total: usize,
|
|
||||||
|
|
||||||
/// The number of layer bytes currently on-disk
|
|
||||||
pub bytes_downloaded: u64,
|
|
||||||
/// The number of layer bytes in the most recently seen heatmap
|
|
||||||
pub bytes_total: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
|
||||||
pub struct TenantScanRemoteStorageShard {
|
|
||||||
pub tenant_shard_id: TenantShardId,
|
|
||||||
pub generation: Option<u32>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
|
||||||
pub struct TenantScanRemoteStorageResponse {
|
|
||||||
pub shards: Vec<TenantScanRemoteStorageShard>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub mod virtual_file {
|
pub mod virtual_file {
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use utils::serde_system_time::SystemTime;
|
use std::time::SystemTime;
|
||||||
|
|
||||||
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
|
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
|
||||||
/// the next tenant.
|
/// the next tenant.
|
||||||
@@ -7,7 +7,7 @@ use utils::serde_system_time::SystemTime;
|
|||||||
///
|
///
|
||||||
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
|
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
|
||||||
/// not handle full u64 values properly.
|
/// not handle full u64 values properly.
|
||||||
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
|
#[derive(serde::Serialize, Debug)]
|
||||||
pub struct PageserverUtilization {
|
pub struct PageserverUtilization {
|
||||||
/// Used disk space
|
/// Used disk space
|
||||||
#[serde(serialize_with = "ser_saturating_u63")]
|
#[serde(serialize_with = "ser_saturating_u63")]
|
||||||
@@ -21,9 +21,17 @@ pub struct PageserverUtilization {
|
|||||||
/// When was this snapshot captured, pageserver local time.
|
/// When was this snapshot captured, pageserver local time.
|
||||||
///
|
///
|
||||||
/// Use millis to give confidence that the value is regenerated often enough.
|
/// Use millis to give confidence that the value is regenerated often enough.
|
||||||
|
#[serde(serialize_with = "ser_rfc3339_millis")]
|
||||||
pub captured_at: SystemTime,
|
pub captured_at: SystemTime,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn ser_rfc3339_millis<S: serde::Serializer>(
|
||||||
|
ts: &SystemTime,
|
||||||
|
serializer: S,
|
||||||
|
) -> Result<S::Ok, S::Error> {
|
||||||
|
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
||||||
|
}
|
||||||
|
|
||||||
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
||||||
///
|
///
|
||||||
/// Instead of newtype, use this because a newtype would get require handling deserializing values
|
/// Instead of newtype, use this because a newtype would get require handling deserializing values
|
||||||
@@ -50,9 +58,7 @@ mod tests {
|
|||||||
disk_usage_bytes: u64::MAX,
|
disk_usage_bytes: u64::MAX,
|
||||||
free_space_bytes: 0,
|
free_space_bytes: 0,
|
||||||
utilization_score: u64::MAX,
|
utilization_score: u64::MAX,
|
||||||
captured_at: SystemTime(
|
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
|
||||||
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
|
|
||||||
),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let s = serde_json::to_string(&doc).unwrap();
|
let s = serde_json::to_string(&doc).unwrap();
|
||||||
|
|||||||
@@ -8,89 +8,12 @@ use hex::FromHex;
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::TenantId;
|
use utils::id::TenantId;
|
||||||
|
|
||||||
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
|
|
||||||
///
|
|
||||||
/// This module contains a variety of types used to represent the concept of sharding
|
|
||||||
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
|
|
||||||
/// we provide an summary here.
|
|
||||||
///
|
|
||||||
/// Types used to describe shards:
|
|
||||||
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
|
|
||||||
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
|
|
||||||
/// a shard suffix.
|
|
||||||
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
|
|
||||||
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
|
|
||||||
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
|
|
||||||
/// tenant, such as layer files.
|
|
||||||
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
|
|
||||||
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
|
|
||||||
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
|
|
||||||
/// four hex digits. An unsharded tenant is `0000`.
|
|
||||||
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
|
|
||||||
///
|
|
||||||
/// Types used to describe the parameters for data distribution in a sharded tenant:
|
|
||||||
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
|
|
||||||
/// multiple shards. Its value is given in 8kiB pages.
|
|
||||||
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
|
|
||||||
/// always zero: this is provided for future upgrades that might introduce different
|
|
||||||
/// data distribution schemes.
|
|
||||||
///
|
|
||||||
/// Examples:
|
|
||||||
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
|
|
||||||
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
|
|
||||||
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
|
||||||
/// and their slugs are 0004, 0104, 0204, and 0304.
|
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardNumber(pub u8);
|
pub struct ShardNumber(pub u8);
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardCount(u8);
|
pub struct ShardCount(u8);
|
||||||
|
|
||||||
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
|
||||||
/// when we need to know which shard we're dealing with, but do not need to know the full
|
|
||||||
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
|
||||||
/// the fully qualified TenantShardId.
|
|
||||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
|
||||||
pub struct ShardIndex {
|
|
||||||
pub shard_number: ShardNumber,
|
|
||||||
pub shard_count: ShardCount,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
|
|
||||||
/// and to check whether that [`ShardNumber`] is the same as the current shard.
|
|
||||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
|
||||||
pub struct ShardIdentity {
|
|
||||||
pub number: ShardNumber,
|
|
||||||
pub count: ShardCount,
|
|
||||||
pub stripe_size: ShardStripeSize,
|
|
||||||
layout: ShardLayout,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Formatting helper, for generating the `shard_id` label in traces.
|
|
||||||
struct ShardSlug<'a>(&'a TenantShardId);
|
|
||||||
|
|
||||||
/// TenantShardId globally identifies a particular shard in a particular tenant.
|
|
||||||
///
|
|
||||||
/// These are written as `<TenantId>-<ShardSlug>`, for example:
|
|
||||||
/// # The second shard in a two-shard tenant
|
|
||||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
|
||||||
///
|
|
||||||
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
|
|
||||||
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
|
|
||||||
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
|
|
||||||
///
|
|
||||||
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
|
|
||||||
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
|
|
||||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
|
||||||
/// as a TenantId.
|
|
||||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
|
||||||
pub struct TenantShardId {
|
|
||||||
pub tenant_id: TenantId,
|
|
||||||
pub shard_number: ShardNumber,
|
|
||||||
pub shard_count: ShardCount,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ShardCount {
|
impl ShardCount {
|
||||||
pub const MAX: Self = Self(u8::MAX);
|
pub const MAX: Self = Self(u8::MAX);
|
||||||
|
|
||||||
@@ -115,7 +38,6 @@ impl ShardCount {
|
|||||||
self.0
|
self.0
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
|
||||||
pub fn is_unsharded(&self) -> bool {
|
pub fn is_unsharded(&self) -> bool {
|
||||||
self.0 == 0
|
self.0 == 0
|
||||||
}
|
}
|
||||||
@@ -131,6 +53,33 @@ impl ShardNumber {
|
|||||||
pub const MAX: Self = Self(u8::MAX);
|
pub const MAX: Self = Self(u8::MAX);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// TenantShardId identify the units of work for the Pageserver.
|
||||||
|
///
|
||||||
|
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
|
||||||
|
///
|
||||||
|
/// # The second shard in a two-shard tenant
|
||||||
|
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||||
|
///
|
||||||
|
/// Historically, tenants could not have multiple shards, and were identified
|
||||||
|
/// by TenantId. To support this, TenantShardId has a special legacy
|
||||||
|
/// mode where `shard_count` is equal to zero: this represents a single-sharded
|
||||||
|
/// tenant which should be written as a TenantId with no suffix.
|
||||||
|
///
|
||||||
|
/// The human-readable encoding of TenantShardId, such as used in API URLs,
|
||||||
|
/// is both forward and backward compatible: a legacy TenantId can be
|
||||||
|
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||||
|
/// as a TenantId.
|
||||||
|
///
|
||||||
|
/// Note that the binary encoding is _not_ backward compatible, because
|
||||||
|
/// at the time sharding is introduced, there are no existing binary structures
|
||||||
|
/// containing TenantId that we need to handle.
|
||||||
|
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||||
|
pub struct TenantShardId {
|
||||||
|
pub tenant_id: TenantId,
|
||||||
|
pub shard_number: ShardNumber,
|
||||||
|
pub shard_count: ShardCount,
|
||||||
|
}
|
||||||
|
|
||||||
impl TenantShardId {
|
impl TenantShardId {
|
||||||
pub fn unsharded(tenant_id: TenantId) -> Self {
|
pub fn unsharded(tenant_id: TenantId) -> Self {
|
||||||
Self {
|
Self {
|
||||||
@@ -162,13 +111,10 @@ impl TenantShardId {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Convenience for code that has special behavior on the 0th shard.
|
/// Convenience for code that has special behavior on the 0th shard.
|
||||||
pub fn is_shard_zero(&self) -> bool {
|
pub fn is_zero(&self) -> bool {
|
||||||
self.shard_number == ShardNumber(0)
|
self.shard_number == ShardNumber(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
|
||||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
|
||||||
/// a shard suffix.
|
|
||||||
pub fn is_unsharded(&self) -> bool {
|
pub fn is_unsharded(&self) -> bool {
|
||||||
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
||||||
}
|
}
|
||||||
@@ -204,6 +150,9 @@ impl TenantShardId {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Formatting helper
|
||||||
|
struct ShardSlug<'a>(&'a TenantShardId);
|
||||||
|
|
||||||
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
write!(
|
write!(
|
||||||
@@ -273,6 +222,16 @@ impl From<[u8; 18]> for TenantShardId {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// For use within the context of a particular tenant, when we need to know which
|
||||||
|
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
|
||||||
|
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
|
||||||
|
/// TenantShardId.
|
||||||
|
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||||
|
pub struct ShardIndex {
|
||||||
|
pub shard_number: ShardNumber,
|
||||||
|
pub shard_count: ShardCount,
|
||||||
|
}
|
||||||
|
|
||||||
impl ShardIndex {
|
impl ShardIndex {
|
||||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
||||||
Self {
|
Self {
|
||||||
@@ -287,9 +246,6 @@ impl ShardIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
|
||||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
|
||||||
/// a shard suffix.
|
|
||||||
pub fn is_unsharded(&self) -> bool {
|
pub fn is_unsharded(&self) -> bool {
|
||||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||||
}
|
}
|
||||||
@@ -357,8 +313,6 @@ impl Serialize for TenantShardId {
|
|||||||
if serializer.is_human_readable() {
|
if serializer.is_human_readable() {
|
||||||
serializer.collect_str(self)
|
serializer.collect_str(self)
|
||||||
} else {
|
} else {
|
||||||
// Note: while human encoding of [`TenantShardId`] is backward and forward
|
|
||||||
// compatible, this binary encoding is not.
|
|
||||||
let mut packed: [u8; 18] = [0; 18];
|
let mut packed: [u8; 18] = [0; 18];
|
||||||
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
|
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
|
||||||
packed[16] = self.shard_number.0;
|
packed[16] = self.shard_number.0;
|
||||||
@@ -436,6 +390,16 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
|
|||||||
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
|
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
|
||||||
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||||
|
|
||||||
|
/// The ShardIdentity contains the information needed for one member of map
|
||||||
|
/// to resolve a key to a shard, and then check whether that shard is ==self.
|
||||||
|
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||||
|
pub struct ShardIdentity {
|
||||||
|
pub number: ShardNumber,
|
||||||
|
pub count: ShardCount,
|
||||||
|
pub stripe_size: ShardStripeSize,
|
||||||
|
layout: ShardLayout,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
||||||
pub enum ShardConfigError {
|
pub enum ShardConfigError {
|
||||||
#[error("Invalid shard count")]
|
#[error("Invalid shard count")]
|
||||||
@@ -475,9 +439,6 @@ impl ShardIdentity {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
|
||||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
|
||||||
/// a shard suffix.
|
|
||||||
pub fn is_unsharded(&self) -> bool {
|
pub fn is_unsharded(&self) -> bool {
|
||||||
self.number == ShardNumber(0) && self.count == ShardCount(0)
|
self.number == ShardNumber(0) && self.count == ShardCount(0)
|
||||||
}
|
}
|
||||||
@@ -526,8 +487,6 @@ impl ShardIdentity {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Return true if the key should be ingested by this shard
|
/// Return true if the key should be ingested by this shard
|
||||||
///
|
|
||||||
/// Shards must ingest _at least_ keys which return true from this check.
|
|
||||||
pub fn is_key_local(&self, key: &Key) -> bool {
|
pub fn is_key_local(&self, key: &Key) -> bool {
|
||||||
assert!(!self.is_broken());
|
assert!(!self.is_broken());
|
||||||
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
|
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
|
||||||
@@ -538,9 +497,7 @@ impl ShardIdentity {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Return true if the key should be discarded if found in this shard's
|
/// Return true if the key should be discarded if found in this shard's
|
||||||
/// data store, e.g. during compaction after a split.
|
/// data store, e.g. during compaction after a split
|
||||||
///
|
|
||||||
/// Shards _may_ drop keys which return false here, but are not obliged to.
|
|
||||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||||
if key_is_shard0(key) {
|
if key_is_shard0(key) {
|
||||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||||
@@ -566,7 +523,7 @@ impl ShardIdentity {
|
|||||||
|
|
||||||
/// Convenience for checking if this identity is the 0th shard in a tenant,
|
/// Convenience for checking if this identity is the 0th shard in a tenant,
|
||||||
/// for special cases on shard 0 such as ingesting relation sizes.
|
/// for special cases on shard 0 such as ingesting relation sizes.
|
||||||
pub fn is_shard_zero(&self) -> bool {
|
pub fn is_zero(&self) -> bool {
|
||||||
self.number == ShardNumber(0)
|
self.number == ShardNumber(0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,9 +6,7 @@
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::NodeId;
|
use utils::id::NodeId;
|
||||||
|
|
||||||
use crate::{
|
use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
|
||||||
controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
||||||
/// startup.
|
/// startup.
|
||||||
@@ -22,20 +20,12 @@ pub struct ReAttachRequest {
|
|||||||
pub register: Option<NodeRegisterRequest>,
|
pub register: Option<NodeRegisterRequest>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_mode() -> LocationConfigMode {
|
#[derive(Serialize, Deserialize)]
|
||||||
LocationConfigMode::AttachedSingle
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
|
||||||
pub struct ReAttachResponseTenant {
|
pub struct ReAttachResponseTenant {
|
||||||
pub id: TenantShardId,
|
pub id: TenantShardId,
|
||||||
/// Mandatory if LocationConfigMode is None or set to an Attached* mode
|
pub gen: u32,
|
||||||
pub gen: Option<u32>,
|
|
||||||
|
|
||||||
/// Default value only for backward compat: this field should be set
|
|
||||||
#[serde(default = "default_mode")]
|
|
||||||
pub mode: LocationConfigMode,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct ReAttachResponse {
|
pub struct ReAttachResponse {
|
||||||
pub tenants: Vec<ReAttachResponseTenant>,
|
pub tenants: Vec<ReAttachResponseTenant>,
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
use anyhow::*;
|
use anyhow::*;
|
||||||
use clap::{value_parser, Arg, ArgMatches, Command};
|
use clap::{value_parser, Arg, ArgMatches, Command};
|
||||||
use postgres::Client;
|
|
||||||
use std::{path::PathBuf, str::FromStr};
|
use std::{path::PathBuf, str::FromStr};
|
||||||
use wal_craft::*;
|
use wal_craft::*;
|
||||||
|
|
||||||
@@ -9,8 +8,8 @@ fn main() -> Result<()> {
|
|||||||
.init();
|
.init();
|
||||||
let arg_matches = cli().get_matches();
|
let arg_matches = cli().get_matches();
|
||||||
|
|
||||||
let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
|
let wal_craft = |arg_matches: &ArgMatches, client| {
|
||||||
let intermediate_lsns = match arg_matches
|
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
|
||||||
.get_one::<String>("type")
|
.get_one::<String>("type")
|
||||||
.map(|s| s.as_str())
|
.map(|s| s.as_str())
|
||||||
.context("'type' is required")?
|
.context("'type' is required")?
|
||||||
@@ -26,7 +25,6 @@ fn main() -> Result<()> {
|
|||||||
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
|
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
|
||||||
a => panic!("Unknown --type argument: {a}"),
|
a => panic!("Unknown --type argument: {a}"),
|
||||||
};
|
};
|
||||||
let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
|
|
||||||
for lsn in intermediate_lsns {
|
for lsn in intermediate_lsns {
|
||||||
println!("intermediate_lsn = {lsn}");
|
println!("intermediate_lsn = {lsn}");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use postgres::types::PgLsn;
|
|||||||
use postgres::Client;
|
use postgres::Client;
|
||||||
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||||
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
||||||
|
use std::cmp::Ordering;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
@@ -231,52 +232,59 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow
|
|||||||
pub trait Crafter {
|
pub trait Crafter {
|
||||||
const NAME: &'static str;
|
const NAME: &'static str;
|
||||||
|
|
||||||
/// Generates WAL using the client `client`. Returns a vector of some valid
|
/// Generates WAL using the client `client`. Returns a pair of:
|
||||||
/// "interesting" intermediate LSNs which one may start reading from.
|
/// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
|
||||||
/// test_end_of_wal uses this to check various starting points.
|
/// May include or exclude Lsn(0) and the end-of-wal.
|
||||||
///
|
/// * The expected end-of-wal LSN.
|
||||||
/// Note that postgres is generally keen about writing some WAL. While we
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
|
||||||
/// try to disable it (autovacuum, big wal_writer_delay, etc) it is always
|
|
||||||
/// possible, e.g. xl_running_xacts are dumped each 15s. So checks about
|
|
||||||
/// stable WAL end would be flaky unless postgres is shut down. For this
|
|
||||||
/// reason returning potential end of WAL here is pointless. Most of the
|
|
||||||
/// time this doesn't happen though, so it is reasonable to create needed
|
|
||||||
/// WAL structure and immediately kill postgres like test_end_of_wal does.
|
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>>;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Wraps some WAL craft function, providing current LSN to it before the
|
|
||||||
/// insertion and flushing WAL afterwards. Also pushes initial LSN to the
|
|
||||||
/// result.
|
|
||||||
fn craft_internal<C: postgres::GenericClient>(
|
fn craft_internal<C: postgres::GenericClient>(
|
||||||
client: &mut C,
|
client: &mut C,
|
||||||
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<Vec<PgLsn>>,
|
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
|
||||||
) -> anyhow::Result<Vec<PgLsn>> {
|
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||||
ensure_server_config(client)?;
|
ensure_server_config(client)?;
|
||||||
|
|
||||||
let initial_lsn = client.pg_current_wal_insert_lsn()?;
|
let initial_lsn = client.pg_current_wal_insert_lsn()?;
|
||||||
info!("LSN initial = {}", initial_lsn);
|
info!("LSN initial = {}", initial_lsn);
|
||||||
|
|
||||||
let mut intermediate_lsns = f(client, initial_lsn)?;
|
let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
|
||||||
|
let last_lsn = match last_lsn {
|
||||||
|
None => client.pg_current_wal_insert_lsn()?,
|
||||||
|
Some(last_lsn) => {
|
||||||
|
let insert_lsn = client.pg_current_wal_insert_lsn()?;
|
||||||
|
match last_lsn.cmp(&insert_lsn) {
|
||||||
|
Ordering::Less => bail!(
|
||||||
|
"Some records were inserted after the crafted WAL: {} vs {}",
|
||||||
|
last_lsn,
|
||||||
|
insert_lsn
|
||||||
|
),
|
||||||
|
Ordering::Equal => last_lsn,
|
||||||
|
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
if !intermediate_lsns.starts_with(&[initial_lsn]) {
|
if !intermediate_lsns.starts_with(&[initial_lsn]) {
|
||||||
intermediate_lsns.insert(0, initial_lsn);
|
intermediate_lsns.insert(0, initial_lsn);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Some records may be not flushed, e.g. non-transactional logical messages.
|
// Some records may be not flushed, e.g. non-transactional logical messages.
|
||||||
//
|
|
||||||
// Note: this is broken if pg_current_wal_insert_lsn is at page boundary
|
|
||||||
// because pg_current_wal_insert_lsn skips page headers.
|
|
||||||
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
||||||
Ok(intermediate_lsns)
|
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
|
||||||
|
Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
|
||||||
|
Ordering::Equal => {}
|
||||||
|
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
|
||||||
|
}
|
||||||
|
Ok((intermediate_lsns, last_lsn))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Simple;
|
pub struct Simple;
|
||||||
impl Crafter for Simple {
|
impl Crafter for Simple {
|
||||||
const NAME: &'static str = "simple";
|
const NAME: &'static str = "simple";
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||||
craft_internal(client, |client, _| {
|
craft_internal(client, |client, _| {
|
||||||
client.execute("CREATE table t(x int)", &[])?;
|
client.execute("CREATE table t(x int)", &[])?;
|
||||||
Ok(Vec::new())
|
Ok((Vec::new(), None))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -284,36 +292,29 @@ impl Crafter for Simple {
|
|||||||
pub struct LastWalRecordXlogSwitch;
|
pub struct LastWalRecordXlogSwitch;
|
||||||
impl Crafter for LastWalRecordXlogSwitch {
|
impl Crafter for LastWalRecordXlogSwitch {
|
||||||
const NAME: &'static str = "last_wal_record_xlog_switch";
|
const NAME: &'static str = "last_wal_record_xlog_switch";
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||||
// Do not use craft_internal because here we end up with flush_lsn exactly on
|
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
||||||
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
||||||
ensure_server_config(client)?;
|
ensure_server_config(client)?;
|
||||||
|
|
||||||
client.execute("CREATE table t(x int)", &[])?;
|
client.execute("CREATE table t(x int)", &[])?;
|
||||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||||
// pg_switch_wal returns end of last record of the switched segment,
|
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||||
// i.e. end of SWITCH itself.
|
let next_segment = PgLsn::from(0x0200_0000);
|
||||||
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
|
||||||
let before_xlog_switch_u64 = u64::from(before_xlog_switch);
|
|
||||||
let next_segment = PgLsn::from(
|
|
||||||
before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64)
|
|
||||||
+ WAL_SEGMENT_SIZE as u64,
|
|
||||||
);
|
|
||||||
ensure!(
|
ensure!(
|
||||||
xlog_switch_record_end <= next_segment,
|
after_xlog_switch <= next_segment,
|
||||||
"XLOG_SWITCH record ended after the expected segment boundary: {} > {}",
|
"XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
|
||||||
xlog_switch_record_end,
|
after_xlog_switch,
|
||||||
next_segment
|
next_segment
|
||||||
);
|
);
|
||||||
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
|
pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
|
||||||
/// Craft xlog SWITCH record ending at page boundary.
|
|
||||||
impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||||
const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
|
const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||||
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
||||||
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
||||||
ensure_server_config(client)?;
|
ensure_server_config(client)?;
|
||||||
@@ -360,29 +361,28 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
|||||||
|
|
||||||
// Emit the XLOG_SWITCH
|
// Emit the XLOG_SWITCH
|
||||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||||
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||||
let next_segment = PgLsn::from(0x0200_0000);
|
let next_segment = PgLsn::from(0x0200_0000);
|
||||||
ensure!(
|
ensure!(
|
||||||
xlog_switch_record_end < next_segment,
|
after_xlog_switch < next_segment,
|
||||||
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
|
"XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
|
||||||
xlog_switch_record_end,
|
after_xlog_switch,
|
||||||
next_segment
|
next_segment
|
||||||
);
|
);
|
||||||
ensure!(
|
ensure!(
|
||||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
|
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
|
||||||
xlog_switch_record_end,
|
after_xlog_switch,
|
||||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
|
||||||
);
|
);
|
||||||
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Write ~16MB logical message; it should cross WAL segment.
|
fn craft_single_logical_message(
|
||||||
fn craft_seg_size_logical_message(
|
|
||||||
client: &mut impl postgres::GenericClient,
|
client: &mut impl postgres::GenericClient,
|
||||||
transactional: bool,
|
transactional: bool,
|
||||||
) -> anyhow::Result<Vec<PgLsn>> {
|
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||||
craft_internal(client, |client, initial_lsn| {
|
craft_internal(client, |client, initial_lsn| {
|
||||||
ensure!(
|
ensure!(
|
||||||
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
|
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
|
||||||
@@ -405,24 +405,34 @@ fn craft_seg_size_logical_message(
|
|||||||
"Logical message crossed two segments"
|
"Logical message crossed two segments"
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok(vec![message_lsn])
|
if transactional {
|
||||||
|
// Transactional logical messages are part of a transaction, so the one above is
|
||||||
|
// followed by a small COMMIT record.
|
||||||
|
|
||||||
|
let after_message_lsn = client.pg_current_wal_insert_lsn()?;
|
||||||
|
ensure!(
|
||||||
|
message_lsn < after_message_lsn,
|
||||||
|
"No record found after the emitted message"
|
||||||
|
);
|
||||||
|
Ok((vec![message_lsn], Some(after_message_lsn)))
|
||||||
|
} else {
|
||||||
|
Ok((Vec::new(), Some(message_lsn)))
|
||||||
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct WalRecordCrossingSegmentFollowedBySmallOne;
|
pub struct WalRecordCrossingSegmentFollowedBySmallOne;
|
||||||
impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
|
impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
|
||||||
const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
|
const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||||
// Transactional message crossing WAL segment will be followed by small
|
craft_single_logical_message(client, true)
|
||||||
// commit record.
|
|
||||||
craft_seg_size_logical_message(client, true)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct LastWalRecordCrossingSegment;
|
pub struct LastWalRecordCrossingSegment;
|
||||||
impl Crafter for LastWalRecordCrossingSegment {
|
impl Crafter for LastWalRecordCrossingSegment {
|
||||||
const NAME: &'static str = "last_wal_record_crossing_segment";
|
const NAME: &'static str = "last_wal_record_crossing_segment";
|
||||||
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
|
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
|
||||||
craft_seg_size_logical_message(client, false)
|
craft_single_logical_message(client, false)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,15 +11,13 @@ use utils::const_assert;
|
|||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
fn init_logging() {
|
fn init_logging() {
|
||||||
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!(
|
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
|
||||||
"crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"
|
format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
|
||||||
)))
|
))
|
||||||
.is_test(true)
|
.is_test(true)
|
||||||
.try_init();
|
.try_init();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Test that find_end_of_wal returns the same results as pg_dump on various
|
|
||||||
/// WALs created by Crafter.
|
|
||||||
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
||||||
use crate::*;
|
use crate::*;
|
||||||
|
|
||||||
@@ -40,13 +38,13 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
|||||||
}
|
}
|
||||||
cfg.initdb().unwrap();
|
cfg.initdb().unwrap();
|
||||||
let srv = cfg.start_server().unwrap();
|
let srv = cfg.start_server().unwrap();
|
||||||
let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
|
let (intermediate_lsns, expected_end_of_wal_partial) =
|
||||||
|
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
|
||||||
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
|
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
|
||||||
.iter()
|
.iter()
|
||||||
.map(|&lsn| u64::from(lsn).into())
|
.map(|&lsn| u64::from(lsn).into())
|
||||||
.collect();
|
.collect();
|
||||||
// Kill postgres. Note that it might have inserted to WAL something after
|
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
|
||||||
// 'craft' did its job.
|
|
||||||
srv.kill();
|
srv.kill();
|
||||||
|
|
||||||
// Check find_end_of_wal on the initial WAL
|
// Check find_end_of_wal on the initial WAL
|
||||||
@@ -58,7 +56,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
|||||||
.filter(|fname| IsXLogFileName(fname))
|
.filter(|fname| IsXLogFileName(fname))
|
||||||
.max()
|
.max()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment);
|
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
|
||||||
for start_lsn in intermediate_lsns
|
for start_lsn in intermediate_lsns
|
||||||
.iter()
|
.iter()
|
||||||
.chain(std::iter::once(&expected_end_of_wal))
|
.chain(std::iter::once(&expected_end_of_wal))
|
||||||
@@ -93,7 +91,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
|
fn check_pg_waldump_end_of_wal(
|
||||||
|
cfg: &crate::Conf,
|
||||||
|
last_segment: &str,
|
||||||
|
expected_end_of_wal: Lsn,
|
||||||
|
) {
|
||||||
// Get the actual end of WAL by pg_waldump
|
// Get the actual end of WAL by pg_waldump
|
||||||
let waldump_output = cfg
|
let waldump_output = cfg
|
||||||
.pg_waldump("000000010000000000000001", last_segment)
|
.pg_waldump("000000010000000000000001", last_segment)
|
||||||
@@ -111,8 +113,11 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
||||||
info!("waldump erred on {}", waldump_wal_end);
|
info!(
|
||||||
waldump_wal_end
|
"waldump erred on {}, expected wal end at {}",
|
||||||
|
waldump_wal_end, expected_end_of_wal
|
||||||
|
);
|
||||||
|
assert_eq!(waldump_wal_end, expected_end_of_wal);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn check_end_of_wal(
|
fn check_end_of_wal(
|
||||||
@@ -205,9 +210,9 @@ pub fn test_update_next_xid() {
|
|||||||
#[test]
|
#[test]
|
||||||
pub fn test_encode_logical_message() {
|
pub fn test_encode_logical_message() {
|
||||||
let expected = [
|
let expected = [
|
||||||
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
|
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
|
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
|
||||||
105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
|
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
|
||||||
];
|
];
|
||||||
let actual = encode_logical_message("prefix", "message");
|
let actual = encode_logical_message("prefix", "message");
|
||||||
assert_eq!(expected, actual[..]);
|
assert_eq!(expected, actual[..]);
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ camino.workspace = true
|
|||||||
humantime.workspace = true
|
humantime.workspace = true
|
||||||
hyper = { workspace = true, features = ["stream"] }
|
hyper = { workspace = true, features = ["stream"] }
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
rand.workspace = true
|
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||||
|
|||||||
@@ -157,8 +157,9 @@ impl AzureBlobStorage {
|
|||||||
let mut bufs = Vec::new();
|
let mut bufs = Vec::new();
|
||||||
while let Some(part) = response.next().await {
|
while let Some(part) = response.next().await {
|
||||||
let part = part?;
|
let part = part?;
|
||||||
|
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||||
if etag.is_none() {
|
if etag.is_none() {
|
||||||
etag = Some(part.blob.properties.etag);
|
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||||
}
|
}
|
||||||
if last_modified.is_none() {
|
if last_modified.is_none() {
|
||||||
last_modified = Some(part.blob.properties.last_modified.into());
|
last_modified = Some(part.blob.properties.last_modified.into());
|
||||||
@@ -173,16 +174,6 @@ impl AzureBlobStorage {
|
|||||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||||
bufs.push(data);
|
bufs.push(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
if bufs.is_empty() {
|
|
||||||
return Err(DownloadError::Other(anyhow::anyhow!(
|
|
||||||
"Azure GET response contained no buffers"
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already
|
|
||||||
let etag = etag.unwrap();
|
|
||||||
let last_modified = last_modified.unwrap();
|
|
||||||
|
|
||||||
Ok(Download {
|
Ok(Download {
|
||||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||||
etag,
|
etag,
|
||||||
|
|||||||
@@ -42,9 +42,6 @@ pub use self::{
|
|||||||
};
|
};
|
||||||
use s3_bucket::RequestKind;
|
use s3_bucket::RequestKind;
|
||||||
|
|
||||||
/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
|
|
||||||
pub use azure_core::Etag;
|
|
||||||
|
|
||||||
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||||
|
|
||||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||||
@@ -134,11 +131,6 @@ impl RemotePath {
|
|||||||
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
|
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
|
||||||
self.0.strip_prefix(&p.0)
|
self.0.strip_prefix(&p.0)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_trailing_slash(&self) -> Self {
|
|
||||||
// Unwrap safety inputs are guararnteed to be valid UTF-8
|
|
||||||
Self(format!("{}/", self.0).try_into().unwrap())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// We don't need callers to be able to pass arbitrary delimiters: just control
|
/// We don't need callers to be able to pass arbitrary delimiters: just control
|
||||||
@@ -162,21 +154,47 @@ pub struct Listing {
|
|||||||
/// providing basic CRUD operations for storage files.
|
/// providing basic CRUD operations for storage files.
|
||||||
#[allow(async_fn_in_trait)]
|
#[allow(async_fn_in_trait)]
|
||||||
pub trait RemoteStorage: Send + Sync + 'static {
|
pub trait RemoteStorage: Send + Sync + 'static {
|
||||||
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
|
/// Lists all top level subdirectories for a given prefix
|
||||||
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
|
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||||
///
|
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
|
||||||
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
|
/// so this method doesnt need to.
|
||||||
/// from the absolute root of the bucket.
|
async fn list_prefixes(
|
||||||
///
|
&self,
|
||||||
/// `mode` configures whether to use a delimiter. Without a delimiter all keys
|
prefix: Option<&RemotePath>,
|
||||||
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
|
cancel: &CancellationToken,
|
||||||
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
/// returned in `keys` ().
|
let result = self
|
||||||
///
|
.list(prefix, ListingMode::WithDelimiter, None, cancel)
|
||||||
/// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
|
.await?
|
||||||
/// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
|
.prefixes;
|
||||||
/// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
|
Ok(result)
|
||||||
|
}
|
||||||
|
/// Lists all files in directory "recursively"
|
||||||
|
/// (not really recursively, because AWS has a flat namespace)
|
||||||
|
/// Note: This is subtely different than list_prefixes,
|
||||||
|
/// because it is for listing files instead of listing
|
||||||
|
/// names sharing common prefixes.
|
||||||
|
/// For example,
|
||||||
|
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
|
||||||
|
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
|
||||||
|
/// whereas,
|
||||||
|
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
||||||
|
/// See `test_real_s3.rs` for more details.
|
||||||
///
|
///
|
||||||
|
/// max_keys limits max number of keys returned; None means unlimited.
|
||||||
|
async fn list_files(
|
||||||
|
&self,
|
||||||
|
prefix: Option<&RemotePath>,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
|
let result = self
|
||||||
|
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
|
||||||
|
.await?
|
||||||
|
.keys;
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
async fn list(
|
async fn list(
|
||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
@@ -273,9 +291,9 @@ pub type DownloadStream =
|
|||||||
pub struct Download {
|
pub struct Download {
|
||||||
pub download_stream: DownloadStream,
|
pub download_stream: DownloadStream,
|
||||||
/// The last time the file was modified (`last-modified` HTTP header)
|
/// The last time the file was modified (`last-modified` HTTP header)
|
||||||
pub last_modified: SystemTime,
|
pub last_modified: Option<SystemTime>,
|
||||||
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
||||||
pub etag: Etag,
|
pub etag: Option<String>,
|
||||||
/// Extra key-value data, associated with the current remote file.
|
/// Extra key-value data, associated with the current remote file.
|
||||||
pub metadata: Option<StorageMetadata>,
|
pub metadata: Option<StorageMetadata>,
|
||||||
}
|
}
|
||||||
@@ -315,6 +333,41 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A function for listing all the files in a "directory"
|
||||||
|
// Example:
|
||||||
|
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||||
|
//
|
||||||
|
// max_keys limits max number of keys returned; None means unlimited.
|
||||||
|
pub async fn list_files(
|
||||||
|
&self,
|
||||||
|
folder: Option<&RemotePath>,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
|
match self {
|
||||||
|
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
|
||||||
|
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
|
||||||
|
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
|
||||||
|
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// lists common *prefixes*, if any of files
|
||||||
|
// Example:
|
||||||
|
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
|
||||||
|
pub async fn list_prefixes(
|
||||||
|
&self,
|
||||||
|
prefix: Option<&RemotePath>,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
|
match self {
|
||||||
|
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
|
||||||
|
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
|
||||||
|
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
|
||||||
|
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// See [`RemoteStorage::upload`]
|
/// See [`RemoteStorage::upload`]
|
||||||
pub async fn upload(
|
pub async fn upload(
|
||||||
&self,
|
&self,
|
||||||
@@ -509,16 +562,6 @@ impl GenericRemoteStorage {
|
|||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
pub struct StorageMetadata(HashMap<String, String>);
|
pub struct StorageMetadata(HashMap<String, String>);
|
||||||
|
|
||||||
impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
|
|
||||||
fn from(arr: [(&str, &str); N]) -> Self {
|
|
||||||
let map: HashMap<String, String> = arr
|
|
||||||
.iter()
|
|
||||||
.map(|(k, v)| (k.to_string(), v.to_string()))
|
|
||||||
.collect();
|
|
||||||
Self(map)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// External backup storage configuration, enough for creating a client for that storage.
|
/// External backup storage configuration, enough for creating a client for that storage.
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
pub struct RemoteStorageConfig {
|
pub struct RemoteStorageConfig {
|
||||||
|
|||||||
@@ -5,10 +5,12 @@
|
|||||||
//! volume is mounted to the local FS.
|
//! volume is mounted to the local FS.
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashSet,
|
borrow::Cow,
|
||||||
|
future::Future,
|
||||||
io::ErrorKind,
|
io::ErrorKind,
|
||||||
num::NonZeroU32,
|
num::NonZeroU32,
|
||||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
pin::Pin,
|
||||||
|
time::{Duration, SystemTime},
|
||||||
};
|
};
|
||||||
|
|
||||||
use anyhow::{bail, ensure, Context};
|
use anyhow::{bail, ensure, Context};
|
||||||
@@ -20,15 +22,14 @@ use tokio::{
|
|||||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||||
};
|
};
|
||||||
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||||
use utils::crashsafe::path_with_suffix_extension;
|
use tracing::*;
|
||||||
|
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
||||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
use super::{RemoteStorage, StorageMetadata};
|
use super::{RemoteStorage, StorageMetadata};
|
||||||
use crate::Etag;
|
|
||||||
|
|
||||||
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
||||||
|
|
||||||
@@ -91,47 +92,7 @@ impl LocalFs {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
|
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||||
use std::{future::Future, pin::Pin};
|
Ok(get_all_files(&self.storage_root, true)
|
||||||
fn get_all_files<'a, P>(
|
|
||||||
directory_path: P,
|
|
||||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
|
|
||||||
where
|
|
||||||
P: AsRef<Utf8Path> + Send + Sync + 'a,
|
|
||||||
{
|
|
||||||
Box::pin(async move {
|
|
||||||
let directory_path = directory_path.as_ref();
|
|
||||||
if directory_path.exists() {
|
|
||||||
if directory_path.is_dir() {
|
|
||||||
let mut paths = Vec::new();
|
|
||||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
|
||||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
|
||||||
let file_type = dir_entry.file_type().await?;
|
|
||||||
let entry_path =
|
|
||||||
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
|
|
||||||
anyhow::Error::msg(format!(
|
|
||||||
"non-Unicode path: {}",
|
|
||||||
pb.to_string_lossy()
|
|
||||||
))
|
|
||||||
})?;
|
|
||||||
if file_type.is_symlink() {
|
|
||||||
tracing::debug!("{entry_path:?} is a symlink, skipping")
|
|
||||||
} else if file_type.is_dir() {
|
|
||||||
paths.extend(get_all_files(&entry_path).await?.into_iter())
|
|
||||||
} else {
|
|
||||||
paths.push(entry_path);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(paths)
|
|
||||||
} else {
|
|
||||||
bail!("Path {directory_path:?} is not a directory")
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
Ok(Vec::new())
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(get_all_files(&self.storage_root)
|
|
||||||
.await?
|
.await?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|path| {
|
.map(|path| {
|
||||||
@@ -153,20 +114,11 @@ impl LocalFs {
|
|||||||
None => self.storage_root.clone(),
|
None => self.storage_root.clone(),
|
||||||
};
|
};
|
||||||
|
|
||||||
eprintln!("local_fs list: searching from {full_path} for initial_dir");
|
|
||||||
// If we were given a directory, we may use it as our starting point.
|
// If we were given a directory, we may use it as our starting point.
|
||||||
// Otherwise, we must go up to the first ancestor dir that exists. This is because
|
// Otherwise, we must go up to the first ancestor dir that exists. This is because
|
||||||
// S3 object list prefixes can be arbitrary strings, but when reading
|
// S3 object list prefixes can be arbitrary strings, but when reading
|
||||||
// the local filesystem we need a directory to start calling read_dir on.
|
// the local filesystem we need a directory to start calling read_dir on.
|
||||||
let mut initial_dir = full_path.clone();
|
let mut initial_dir = full_path.clone();
|
||||||
|
|
||||||
// If there's no trailing slash, we have to start looking from one above: even if
|
|
||||||
// `initial_dir` is a directory, we should still list any prefixes in the parent
|
|
||||||
// that start with the same string.
|
|
||||||
if !full_path.to_string().ends_with('/') {
|
|
||||||
initial_dir.pop();
|
|
||||||
}
|
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
// Did we make it to the root?
|
// Did we make it to the root?
|
||||||
if initial_dir.parent().is_none() {
|
if initial_dir.parent().is_none() {
|
||||||
@@ -197,8 +149,6 @@ impl LocalFs {
|
|||||||
// starts_with later.
|
// starts_with later.
|
||||||
let prefix = full_path.as_str();
|
let prefix = full_path.as_str();
|
||||||
|
|
||||||
eprintln!("local_fs list: initial_dir={initial_dir}");
|
|
||||||
|
|
||||||
let mut files = vec![];
|
let mut files = vec![];
|
||||||
let mut directory_queue = vec![initial_dir];
|
let mut directory_queue = vec![initial_dir];
|
||||||
while let Some(cur_folder) = directory_queue.pop() {
|
while let Some(cur_folder) = directory_queue.pop() {
|
||||||
@@ -212,8 +162,6 @@ impl LocalFs {
|
|||||||
if full_file_name.is_dir() {
|
if full_file_name.is_dir() {
|
||||||
directory_queue.push(full_file_name);
|
directory_queue.push(full_file_name);
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
eprintln!("Drop {full_file_name}, not in prefix");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -249,7 +197,6 @@ impl LocalFs {
|
|||||||
fs::OpenOptions::new()
|
fs::OpenOptions::new()
|
||||||
.write(true)
|
.write(true)
|
||||||
.create(true)
|
.create(true)
|
||||||
.truncate(true)
|
|
||||||
.open(&temp_file_path)
|
.open(&temp_file_path)
|
||||||
.await
|
.await
|
||||||
.with_context(|| {
|
.with_context(|| {
|
||||||
@@ -343,76 +290,64 @@ impl RemoteStorage for LocalFs {
|
|||||||
max_keys: Option<NonZeroU32>,
|
max_keys: Option<NonZeroU32>,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<Listing, DownloadError> {
|
) -> Result<Listing, DownloadError> {
|
||||||
if let Some(prefix) = prefix {
|
|
||||||
eprintln!("local_fs list: prefix={}", prefix);
|
|
||||||
}
|
|
||||||
let op = async {
|
let op = async {
|
||||||
let mut result = Listing::default();
|
let mut result = Listing::default();
|
||||||
|
|
||||||
// Filter out directories: in S3 directories don't exist, only the keys within them do.
|
if let ListingMode::NoDelimiter = mode {
|
||||||
let keys = self
|
let keys = self
|
||||||
.list_recursive(prefix)
|
.list_recursive(prefix)
|
||||||
|
.await
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
result.keys = keys
|
||||||
|
.into_iter()
|
||||||
|
.filter(|k| {
|
||||||
|
let path = k.with_base(&self.storage_root);
|
||||||
|
!path.is_dir()
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if let Some(max_keys) = max_keys {
|
||||||
|
result.keys.truncate(max_keys.get() as usize);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ok(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
let path = match prefix {
|
||||||
|
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||||
|
None => Cow::Borrowed(&self.storage_root),
|
||||||
|
};
|
||||||
|
|
||||||
|
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||||
.await
|
.await
|
||||||
.map_err(DownloadError::Other)?;
|
.map_err(DownloadError::Other)?;
|
||||||
let keys = keys
|
|
||||||
.into_iter()
|
|
||||||
.filter(|k| {
|
|
||||||
let path = k.with_base(&self.storage_root);
|
|
||||||
!path.is_dir()
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
if let ListingMode::NoDelimiter = mode {
|
// filter out empty directories to mirror s3 behavior.
|
||||||
result.keys = keys;
|
for prefix in prefixes_to_filter {
|
||||||
} else {
|
if prefix.is_dir()
|
||||||
let mut prefixes = HashSet::new();
|
&& is_directory_empty(&prefix)
|
||||||
for key in keys {
|
.await
|
||||||
eprintln!("key: {key}");
|
.map_err(DownloadError::Other)?
|
||||||
// If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
|
{
|
||||||
let relative_key = if let Some(prefix) = prefix {
|
continue;
|
||||||
let mut prefix = prefix.clone();
|
}
|
||||||
// We only strip the dirname of the prefix, so that when we strip it from the start of keys we
|
|
||||||
// end up with full file/dir names.
|
let stripped = prefix
|
||||||
let prefix_full_local_path = prefix.with_base(&self.storage_root);
|
.strip_prefix(&self.storage_root)
|
||||||
let has_slash = prefix.0.to_string().ends_with('/');
|
.context("Failed to strip prefix")
|
||||||
let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
|
.and_then(RemotePath::new)
|
||||||
prefix
|
.expect(
|
||||||
} else {
|
"We list files for storage root, hence should be able to remote the prefix",
|
||||||
prefix.0.pop();
|
);
|
||||||
prefix
|
|
||||||
};
|
if prefix.is_dir() {
|
||||||
eprintln!("strip_prefix={strip_prefix}");
|
result.prefixes.push(stripped);
|
||||||
|
} else {
|
||||||
RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
|
result.keys.push(stripped);
|
||||||
} else {
|
|
||||||
key
|
|
||||||
};
|
|
||||||
|
|
||||||
eprintln!("relative_key: {relative_key}");
|
|
||||||
|
|
||||||
let relative_key = format!("{}", relative_key);
|
|
||||||
if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
|
||||||
let first_part = relative_key
|
|
||||||
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
|
||||||
.next()
|
|
||||||
.unwrap()
|
|
||||||
.to_owned();
|
|
||||||
prefixes.insert(first_part);
|
|
||||||
} else {
|
|
||||||
result
|
|
||||||
.keys
|
|
||||||
.push(RemotePath::from_string(&relative_key).unwrap());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
result.prefixes = prefixes
|
|
||||||
.into_iter()
|
|
||||||
.map(|s| RemotePath::from_string(&s).unwrap())
|
|
||||||
.collect();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(max_keys) = max_keys {
|
|
||||||
result.keys.truncate(max_keys.get() as usize);
|
|
||||||
}
|
|
||||||
Ok(result)
|
Ok(result)
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -471,37 +406,35 @@ impl RemoteStorage for LocalFs {
|
|||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<Download, DownloadError> {
|
) -> Result<Download, DownloadError> {
|
||||||
let target_path = from.with_base(&self.storage_root);
|
let target_path = from.with_base(&self.storage_root);
|
||||||
|
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||||
|
let source = ReaderStream::new(
|
||||||
|
fs::OpenOptions::new()
|
||||||
|
.read(true)
|
||||||
|
.open(&target_path)
|
||||||
|
.await
|
||||||
|
.with_context(|| {
|
||||||
|
format!("Failed to open source file {target_path:?} to use in the download")
|
||||||
|
})
|
||||||
|
.map_err(DownloadError::Other)?,
|
||||||
|
);
|
||||||
|
|
||||||
let file_metadata = file_metadata(&target_path).await?;
|
let metadata = self
|
||||||
|
.read_storage_metadata(&target_path)
|
||||||
let source = ReaderStream::new(
|
|
||||||
fs::OpenOptions::new()
|
|
||||||
.read(true)
|
|
||||||
.open(&target_path)
|
|
||||||
.await
|
.await
|
||||||
.with_context(|| {
|
.map_err(DownloadError::Other)?;
|
||||||
format!("Failed to open source file {target_path:?} to use in the download")
|
|
||||||
})
|
|
||||||
.map_err(DownloadError::Other)?,
|
|
||||||
);
|
|
||||||
|
|
||||||
let metadata = self
|
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||||
.read_storage_metadata(&target_path)
|
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||||
.await
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
Ok(Download {
|
||||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
metadata,
|
||||||
|
last_modified: None,
|
||||||
let etag = mock_etag(&file_metadata);
|
etag: None,
|
||||||
Ok(Download {
|
download_stream: Box::pin(source),
|
||||||
metadata,
|
})
|
||||||
last_modified: file_metadata
|
} else {
|
||||||
.modified()
|
Err(DownloadError::NotFound)
|
||||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
|
}
|
||||||
etag,
|
|
||||||
download_stream: Box::pin(source),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download_byte_range(
|
async fn download_byte_range(
|
||||||
@@ -519,51 +452,50 @@ impl RemoteStorage for LocalFs {
|
|||||||
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
|
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let target_path = from.with_base(&self.storage_root);
|
let target_path = from.with_base(&self.storage_root);
|
||||||
let file_metadata = file_metadata(&target_path).await?;
|
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||||
let mut source = tokio::fs::OpenOptions::new()
|
let mut source = tokio::fs::OpenOptions::new()
|
||||||
.read(true)
|
.read(true)
|
||||||
.open(&target_path)
|
.open(&target_path)
|
||||||
.await
|
.await
|
||||||
.with_context(|| {
|
.with_context(|| {
|
||||||
format!("Failed to open source file {target_path:?} to use in the download")
|
format!("Failed to open source file {target_path:?} to use in the download")
|
||||||
|
})
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
let len = source
|
||||||
|
.metadata()
|
||||||
|
.await
|
||||||
|
.context("query file length")
|
||||||
|
.map_err(DownloadError::Other)?
|
||||||
|
.len();
|
||||||
|
|
||||||
|
source
|
||||||
|
.seek(io::SeekFrom::Start(start_inclusive))
|
||||||
|
.await
|
||||||
|
.context("Failed to seek to the range start in a local storage file")
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
let metadata = self
|
||||||
|
.read_storage_metadata(&target_path)
|
||||||
|
.await
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||||
|
let source = ReaderStream::new(source);
|
||||||
|
|
||||||
|
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||||
|
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||||
|
|
||||||
|
Ok(Download {
|
||||||
|
metadata,
|
||||||
|
last_modified: None,
|
||||||
|
etag: None,
|
||||||
|
download_stream: Box::pin(source),
|
||||||
})
|
})
|
||||||
.map_err(DownloadError::Other)?;
|
} else {
|
||||||
|
Err(DownloadError::NotFound)
|
||||||
let len = source
|
}
|
||||||
.metadata()
|
|
||||||
.await
|
|
||||||
.context("query file length")
|
|
||||||
.map_err(DownloadError::Other)?
|
|
||||||
.len();
|
|
||||||
|
|
||||||
source
|
|
||||||
.seek(io::SeekFrom::Start(start_inclusive))
|
|
||||||
.await
|
|
||||||
.context("Failed to seek to the range start in a local storage file")
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
let metadata = self
|
|
||||||
.read_storage_metadata(&target_path)
|
|
||||||
.await
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
|
||||||
let source = ReaderStream::new(source);
|
|
||||||
|
|
||||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
|
||||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
|
||||||
|
|
||||||
let etag = mock_etag(&file_metadata);
|
|
||||||
Ok(Download {
|
|
||||||
metadata,
|
|
||||||
last_modified: file_metadata
|
|
||||||
.modified()
|
|
||||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
|
|
||||||
etag,
|
|
||||||
download_stream: Box::pin(source),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
|
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||||
@@ -623,6 +555,50 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
|||||||
path_with_suffix_extension(original_path, "metadata")
|
path_with_suffix_extension(original_path, "metadata")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_all_files<'a, P>(
|
||||||
|
directory_path: P,
|
||||||
|
recursive: bool,
|
||||||
|
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
|
||||||
|
where
|
||||||
|
P: AsRef<Utf8Path> + Send + Sync + 'a,
|
||||||
|
{
|
||||||
|
Box::pin(async move {
|
||||||
|
let directory_path = directory_path.as_ref();
|
||||||
|
if directory_path.exists() {
|
||||||
|
if directory_path.is_dir() {
|
||||||
|
let mut paths = Vec::new();
|
||||||
|
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||||
|
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||||
|
let file_type = dir_entry.file_type().await?;
|
||||||
|
let entry_path =
|
||||||
|
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
|
||||||
|
anyhow::Error::msg(format!(
|
||||||
|
"non-Unicode path: {}",
|
||||||
|
pb.to_string_lossy()
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
if file_type.is_symlink() {
|
||||||
|
debug!("{entry_path:?} is a symlink, skipping")
|
||||||
|
} else if file_type.is_dir() {
|
||||||
|
if recursive {
|
||||||
|
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
|
||||||
|
} else {
|
||||||
|
paths.push(entry_path)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
paths.push(entry_path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(paths)
|
||||||
|
} else {
|
||||||
|
bail!("Path {directory_path:?} is not a directory")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Ok(Vec::new())
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
|
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||||
let target_dir = match target_file_path.parent() {
|
let target_dir = match target_file_path.parent() {
|
||||||
Some(parent_dir) => parent_dir,
|
Some(parent_dir) => parent_dir,
|
||||||
@@ -634,22 +610,13 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
|
fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
|
||||||
tokio::fs::metadata(&file_path).await.map_err(|e| {
|
if file_path.exists() {
|
||||||
if e.kind() == ErrorKind::NotFound {
|
ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
|
||||||
DownloadError::NotFound
|
Ok(true)
|
||||||
} else {
|
} else {
|
||||||
DownloadError::BadInput(e.into())
|
Ok(false)
|
||||||
}
|
}
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we
|
|
||||||
// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
|
|
||||||
// quickly, with less overhead than using a mock S3 server.
|
|
||||||
fn mock_etag(meta: &std::fs::Metadata) -> Etag {
|
|
||||||
let mtime = meta.modified().expect("Filesystem mtime missing");
|
|
||||||
format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -942,18 +909,13 @@ mod fs_tests {
|
|||||||
// No delimiter: should recursively list everything
|
// No delimiter: should recursively list everything
|
||||||
let (storage, cancel) = create_storage()?;
|
let (storage, cancel) = create_storage()?;
|
||||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
|
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
|
||||||
let child_sibling =
|
|
||||||
upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
|
|
||||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
|
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
|
||||||
|
|
||||||
let listing = storage
|
let listing = storage
|
||||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||||
.await?;
|
.await?;
|
||||||
assert!(listing.prefixes.is_empty());
|
assert!(listing.prefixes.is_empty());
|
||||||
assert_eq!(
|
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
||||||
listing.keys,
|
|
||||||
[uncle.clone(), child.clone(), child_sibling.clone()].to_vec()
|
|
||||||
);
|
|
||||||
|
|
||||||
// Delimiter: should only go one deep
|
// Delimiter: should only go one deep
|
||||||
let listing = storage
|
let listing = storage
|
||||||
@@ -966,25 +928,7 @@ mod fs_tests {
|
|||||||
);
|
);
|
||||||
assert!(listing.keys.is_empty());
|
assert!(listing.keys.is_empty());
|
||||||
|
|
||||||
// Delimiter & prefix with a trailing slash
|
// Delimiter & prefix
|
||||||
let listing = storage
|
|
||||||
.list(
|
|
||||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
|
|
||||||
ListingMode::WithDelimiter,
|
|
||||||
None,
|
|
||||||
&cancel,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
assert_eq!(
|
|
||||||
listing.keys,
|
|
||||||
[RemotePath::from_string("uncle").unwrap()].to_vec()
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
listing.prefixes,
|
|
||||||
[RemotePath::from_string("parent").unwrap()].to_vec()
|
|
||||||
);
|
|
||||||
|
|
||||||
// Delimiter and prefix without a trailing slash
|
|
||||||
let listing = storage
|
let listing = storage
|
||||||
.list(
|
.list(
|
||||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
||||||
@@ -993,66 +937,12 @@ mod fs_tests {
|
|||||||
&cancel,
|
&cancel,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
assert_eq!(listing.keys, [].to_vec());
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
listing.prefixes,
|
listing.prefixes,
|
||||||
[RemotePath::from_string("grandparent").unwrap()].to_vec()
|
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
|
||||||
);
|
.to_vec()
|
||||||
|
|
||||||
// Delimiter and prefix that's partway through a path component
|
|
||||||
let listing = storage
|
|
||||||
.list(
|
|
||||||
Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
|
|
||||||
ListingMode::WithDelimiter,
|
|
||||||
None,
|
|
||||||
&cancel,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
assert_eq!(listing.keys, [].to_vec());
|
|
||||||
assert_eq!(
|
|
||||||
listing.prefixes,
|
|
||||||
[RemotePath::from_string("grandparent").unwrap()].to_vec()
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::test]
|
|
||||||
async fn list_part_component() -> anyhow::Result<()> {
|
|
||||||
// No delimiter: should recursively list everything
|
|
||||||
let (storage, cancel) = create_storage()?;
|
|
||||||
|
|
||||||
// Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
|
|
||||||
// of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
|
|
||||||
// a freeform prefix.
|
|
||||||
let _child_a =
|
|
||||||
upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
|
|
||||||
let _child_b =
|
|
||||||
upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
|
|
||||||
|
|
||||||
// Delimiter and prefix that's partway through a path component
|
|
||||||
let listing = storage
|
|
||||||
.list(
|
|
||||||
Some(
|
|
||||||
&RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
|
|
||||||
),
|
|
||||||
ListingMode::WithDelimiter,
|
|
||||||
None,
|
|
||||||
&cancel,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
assert_eq!(listing.keys, [].to_vec());
|
|
||||||
|
|
||||||
let mut found_prefixes = listing.prefixes.clone();
|
|
||||||
found_prefixes.sort();
|
|
||||||
assert_eq!(
|
|
||||||
found_prefixes,
|
|
||||||
[
|
|
||||||
RemotePath::from_string("tenant").unwrap(),
|
|
||||||
RemotePath::from_string("tenant-01").unwrap(),
|
|
||||||
]
|
|
||||||
.to_vec()
|
|
||||||
);
|
);
|
||||||
|
assert_eq!(listing.keys, [uncle.clone()].to_vec());
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,8 +35,8 @@ use aws_sdk_s3::{
|
|||||||
};
|
};
|
||||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||||
|
|
||||||
|
use aws_smithy_types::byte_stream::ByteStream;
|
||||||
use aws_smithy_types::{body::SdkBody, DateTime};
|
use aws_smithy_types::{body::SdkBody, DateTime};
|
||||||
use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
|
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use futures::stream::Stream;
|
use futures::stream::Stream;
|
||||||
use hyper::Body;
|
use hyper::Body;
|
||||||
@@ -178,7 +178,10 @@ impl S3Bucket {
|
|||||||
|
|
||||||
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||||
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||||
let path_string = path.get_path().as_str();
|
let path_string = path
|
||||||
|
.get_path()
|
||||||
|
.as_str()
|
||||||
|
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||||
match &self.prefix_in_bucket {
|
match &self.prefix_in_bucket {
|
||||||
Some(prefix) => prefix.clone() + "/" + path_string,
|
Some(prefix) => prefix.clone() + "/" + path_string,
|
||||||
None => path_string.to_string(),
|
None => path_string.to_string(),
|
||||||
@@ -284,17 +287,8 @@ impl S3Bucket {
|
|||||||
let remaining = self.timeout.saturating_sub(started_at.elapsed());
|
let remaining = self.timeout.saturating_sub(started_at.elapsed());
|
||||||
|
|
||||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||||
let etag = object_output
|
let etag = object_output.e_tag;
|
||||||
.e_tag
|
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||||
.ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
|
|
||||||
.into();
|
|
||||||
let last_modified = object_output
|
|
||||||
.last_modified
|
|
||||||
.ok_or(DownloadError::Other(anyhow::anyhow!(
|
|
||||||
"Missing LastModified header"
|
|
||||||
)))?
|
|
||||||
.try_into()
|
|
||||||
.map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
|
|
||||||
|
|
||||||
let body = object_output.body;
|
let body = object_output.body;
|
||||||
let body = ByteStreamAsStream::from(body);
|
let body = ByteStreamAsStream::from(body);
|
||||||
@@ -468,7 +462,17 @@ impl RemoteStorage for S3Bucket {
|
|||||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||||
let list_prefix = prefix
|
let list_prefix = prefix
|
||||||
.map(|p| self.relative_path_to_s3_object(p))
|
.map(|p| self.relative_path_to_s3_object(p))
|
||||||
.or_else(|| self.prefix_in_bucket.clone().map(|s| s + "/"));
|
.or_else(|| self.prefix_in_bucket.clone())
|
||||||
|
.map(|mut p| {
|
||||||
|
// required to end with a separator
|
||||||
|
// otherwise request will return only the entry of a prefix
|
||||||
|
if matches!(mode, ListingMode::WithDelimiter)
|
||||||
|
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||||
|
{
|
||||||
|
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||||
|
}
|
||||||
|
p
|
||||||
|
});
|
||||||
|
|
||||||
let _permit = self.permit(kind, cancel).await?;
|
let _permit = self.permit(kind, cancel).await?;
|
||||||
|
|
||||||
@@ -536,15 +540,11 @@ impl RemoteStorage for S3Bucket {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// S3 gives us prefixes like "foo/", we return them like "foo"
|
result.prefixes.extend(
|
||||||
result.prefixes.extend(prefixes.iter().filter_map(|o| {
|
prefixes
|
||||||
Some(
|
.iter()
|
||||||
self.s3_object_to_relative_path(
|
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||||
o.prefix()?
|
);
|
||||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
}));
|
|
||||||
|
|
||||||
continuation_token = match response.next_continuation_token {
|
continuation_token = match response.next_continuation_token {
|
||||||
Some(new_token) => Some(new_token),
|
Some(new_token) => Some(new_token),
|
||||||
@@ -1041,22 +1041,22 @@ mod tests {
|
|||||||
Some("/test/prefix/"),
|
Some("/test/prefix/"),
|
||||||
];
|
];
|
||||||
let expected_outputs = [
|
let expected_outputs = [
|
||||||
vec!["", "some/path", "some/path/"],
|
vec!["", "some/path", "some/path"],
|
||||||
vec!["/", "/some/path", "/some/path/"],
|
vec!["/", "/some/path", "/some/path"],
|
||||||
vec![
|
vec![
|
||||||
"test/prefix/",
|
"test/prefix/",
|
||||||
"test/prefix/some/path",
|
"test/prefix/some/path",
|
||||||
"test/prefix/some/path/",
|
"test/prefix/some/path",
|
||||||
],
|
],
|
||||||
vec![
|
vec![
|
||||||
"test/prefix/",
|
"test/prefix/",
|
||||||
"test/prefix/some/path",
|
"test/prefix/some/path",
|
||||||
"test/prefix/some/path/",
|
"test/prefix/some/path",
|
||||||
],
|
],
|
||||||
vec![
|
vec![
|
||||||
"test/prefix/",
|
"test/prefix/",
|
||||||
"test/prefix/some/path",
|
"test/prefix/some/path",
|
||||||
"test/prefix/some/path/",
|
"test/prefix/some/path",
|
||||||
],
|
],
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -107,6 +107,27 @@ impl UnreliableWrapper {
|
|||||||
type VoidStorage = crate::LocalFs;
|
type VoidStorage = crate::LocalFs;
|
||||||
|
|
||||||
impl RemoteStorage for UnreliableWrapper {
|
impl RemoteStorage for UnreliableWrapper {
|
||||||
|
async fn list_prefixes(
|
||||||
|
&self,
|
||||||
|
prefix: Option<&RemotePath>,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
|
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
self.inner.list_prefixes(prefix, cancel).await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list_files(
|
||||||
|
&self,
|
||||||
|
folder: Option<&RemotePath>,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
|
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
self.inner.list_files(folder, max_keys, cancel).await
|
||||||
|
}
|
||||||
|
|
||||||
async fn list(
|
async fn list(
|
||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use remote_storage::ListingMode;
|
|
||||||
use remote_storage::RemotePath;
|
use remote_storage::RemotePath;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::{collections::HashSet, num::NonZeroU32};
|
use std::{collections::HashSet, num::NonZeroU32};
|
||||||
@@ -55,9 +54,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
|||||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||||
.context("common_prefix construction")?;
|
.context("common_prefix construction")?;
|
||||||
let root_remote_prefixes = test_client
|
let root_remote_prefixes = test_client
|
||||||
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
.list_prefixes(None, &cancel)
|
||||||
.await?
|
.await
|
||||||
.prefixes
|
.context("client list root prefixes failure")?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<HashSet<_>>();
|
.collect::<HashSet<_>>();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -66,14 +65,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
|||||||
);
|
);
|
||||||
|
|
||||||
let nested_remote_prefixes = test_client
|
let nested_remote_prefixes = test_client
|
||||||
.list(
|
.list_prefixes(Some(&base_prefix), &cancel)
|
||||||
Some(&base_prefix.add_trailing_slash()),
|
.await
|
||||||
ListingMode::WithDelimiter,
|
.context("client list nested prefixes failure")?
|
||||||
None,
|
|
||||||
&cancel,
|
|
||||||
)
|
|
||||||
.await?
|
|
||||||
.prefixes
|
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<HashSet<_>>();
|
.collect::<HashSet<_>>();
|
||||||
let remote_only_prefixes = nested_remote_prefixes
|
let remote_only_prefixes = nested_remote_prefixes
|
||||||
@@ -96,13 +90,11 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
|||||||
///
|
///
|
||||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
||||||
/// Then performs the following queries:
|
/// Then performs the following queries:
|
||||||
/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||||
/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||||
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
|
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn list_no_delimiter_works(
|
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
|
||||||
ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let ctx = match ctx {
|
let ctx = match ctx {
|
||||||
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
|
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||||
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
|
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
|
||||||
@@ -115,36 +107,29 @@ async fn list_no_delimiter_works(
|
|||||||
let base_prefix =
|
let base_prefix =
|
||||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||||
let root_files = test_client
|
let root_files = test_client
|
||||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
.list_files(None, None, &cancel)
|
||||||
.await
|
.await
|
||||||
.context("client list root files failure")?
|
.context("client list root files failure")?
|
||||||
.keys
|
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<HashSet<_>>();
|
.collect::<HashSet<_>>();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
root_files,
|
root_files,
|
||||||
ctx.remote_blobs.clone(),
|
ctx.remote_blobs.clone(),
|
||||||
"remote storage list on root mismatches with the uploads."
|
"remote storage list_files on root mismatches with the uploads."
|
||||||
);
|
);
|
||||||
|
|
||||||
// Test that max_keys limit works. In total there are about 21 files (see
|
// Test that max_keys limit works. In total there are about 21 files (see
|
||||||
// upload_simple_remote_data call in test_real_s3.rs).
|
// upload_simple_remote_data call in test_real_s3.rs).
|
||||||
let limited_root_files = test_client
|
let limited_root_files = test_client
|
||||||
.list(
|
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
|
||||||
None,
|
|
||||||
ListingMode::NoDelimiter,
|
|
||||||
Some(NonZeroU32::new(2).unwrap()),
|
|
||||||
&cancel,
|
|
||||||
)
|
|
||||||
.await
|
.await
|
||||||
.context("client list root files failure")?;
|
.context("client list root files failure")?;
|
||||||
assert_eq!(limited_root_files.keys.len(), 2);
|
assert_eq!(limited_root_files.len(), 2);
|
||||||
|
|
||||||
let nested_remote_files = test_client
|
let nested_remote_files = test_client
|
||||||
.list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
|
.list_files(Some(&base_prefix), None, &cancel)
|
||||||
.await
|
.await
|
||||||
.context("client list nested files failure")?
|
.context("client list nested files failure")?
|
||||||
.keys
|
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<HashSet<_>>();
|
.collect::<HashSet<_>>();
|
||||||
let trim_remote_blobs: HashSet<_> = ctx
|
let trim_remote_blobs: HashSet<_> = ctx
|
||||||
@@ -156,7 +141,7 @@ async fn list_no_delimiter_works(
|
|||||||
.collect();
|
.collect();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
nested_remote_files, trim_remote_blobs,
|
nested_remote_files, trim_remote_blobs,
|
||||||
"remote storage list on subdirrectory mismatches with the uploads."
|
"remote storage list_files on subdirrectory mismatches with the uploads."
|
||||||
);
|
);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -214,11 +199,7 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
|
|||||||
|
|
||||||
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
|
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
|
||||||
|
|
||||||
let prefixes = ctx
|
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
|
||||||
.client
|
|
||||||
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
|
||||||
.await?
|
|
||||||
.prefixes;
|
|
||||||
|
|
||||||
assert_eq!(prefixes.len(), 1);
|
assert_eq!(prefixes.len(), 1);
|
||||||
|
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ enum MaybeEnabledStorage {
|
|||||||
Disabled,
|
Disabled,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
impl AsyncTestContext for MaybeEnabledStorage {
|
impl AsyncTestContext for MaybeEnabledStorage {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
@@ -85,6 +86,7 @@ struct AzureWithTestBlobs {
|
|||||||
remote_blobs: HashSet<RemotePath>,
|
remote_blobs: HashSet<RemotePath>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
@@ -132,6 +134,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||||
|
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||||
|
// whereas the list_files function is concerned with listing files.
|
||||||
|
// See `RemoteStorage::list_files` documentation for more details
|
||||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||||
Enabled(AzureWithSimpleTestBlobs),
|
Enabled(AzureWithSimpleTestBlobs),
|
||||||
Disabled,
|
Disabled,
|
||||||
@@ -142,6 +148,7 @@ struct AzureWithSimpleTestBlobs {
|
|||||||
remote_blobs: HashSet<RemotePath>,
|
remote_blobs: HashSet<RemotePath>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
|
|||||||
@@ -12,8 +12,8 @@ use anyhow::Context;
|
|||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use futures_util::StreamExt;
|
use futures_util::StreamExt;
|
||||||
use remote_storage::{
|
use remote_storage::{
|
||||||
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
|
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||||
RemoteStorageKind, S3Config,
|
S3Config,
|
||||||
};
|
};
|
||||||
use test_context::test_context;
|
use test_context::test_context;
|
||||||
use test_context::AsyncTestContext;
|
use test_context::AsyncTestContext;
|
||||||
@@ -75,14 +75,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
|||||||
client: &Arc<GenericRemoteStorage>,
|
client: &Arc<GenericRemoteStorage>,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> anyhow::Result<HashSet<RemotePath>> {
|
) -> anyhow::Result<HashSet<RemotePath>> {
|
||||||
Ok(
|
Ok(retry(|| client.list_files(None, None, cancel))
|
||||||
retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
|
.await
|
||||||
.await
|
.context("list root files failure")?
|
||||||
.context("list root files failure")?
|
.into_iter()
|
||||||
.keys
|
.collect::<HashSet<_>>())
|
||||||
.into_iter()
|
|
||||||
.collect::<HashSet<_>>(),
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let cancel = CancellationToken::new();
|
let cancel = CancellationToken::new();
|
||||||
@@ -121,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
|||||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||||
{
|
{
|
||||||
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
|
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
|
||||||
let last_modified = dl.last_modified;
|
let last_modified = dl.last_modified.unwrap();
|
||||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||||
let t0_hwt = t0 + half_wt;
|
let t0_hwt = t0 + half_wt;
|
||||||
let t1_hwt = t1 - half_wt;
|
let t1_hwt = t1 - half_wt;
|
||||||
@@ -222,6 +219,7 @@ enum MaybeEnabledStorage {
|
|||||||
Disabled,
|
Disabled,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
impl AsyncTestContext for MaybeEnabledStorage {
|
impl AsyncTestContext for MaybeEnabledStorage {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
@@ -250,6 +248,7 @@ struct S3WithTestBlobs {
|
|||||||
remote_blobs: HashSet<RemotePath>,
|
remote_blobs: HashSet<RemotePath>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
@@ -297,6 +296,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||||
|
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||||
|
// whereas the list_files function is concerned with listing files.
|
||||||
|
// See `RemoteStorage::list_files` documentation for more details
|
||||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||||
Enabled(S3WithSimpleTestBlobs),
|
Enabled(S3WithSimpleTestBlobs),
|
||||||
Disabled,
|
Disabled,
|
||||||
@@ -307,6 +310,7 @@ struct S3WithSimpleTestBlobs {
|
|||||||
remote_blobs: HashSet<RemotePath>,
|
remote_blobs: HashSet<RemotePath>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
|
||||||
async fn setup() -> Self {
|
async fn setup() -> Self {
|
||||||
ensure_logging_ready();
|
ensure_logging_ready();
|
||||||
|
|||||||
@@ -247,7 +247,7 @@ fn scenario_4() {
|
|||||||
//
|
//
|
||||||
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
|
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
|
||||||
//
|
//
|
||||||
// (If we used the method from the previous scenario, and
|
// (If we used the the method from the previous scenario, and
|
||||||
// kept only snapshot at the branch point, we'd need to keep
|
// kept only snapshot at the branch point, we'd need to keep
|
||||||
// all the WAL between 10000-18000 on the main branch, so
|
// all the WAL between 10000-18000 on the main branch, so
|
||||||
// the total size would be 5000 + 1000 + 8000 = 14000. The
|
// the total size would be 5000 + 1000 + 8000 = 14000. The
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ testing = ["fail/failpoints"]
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
arc-swap.workspace = true
|
arc-swap.workspace = true
|
||||||
sentry.workspace = true
|
sentry.workspace = true
|
||||||
async-compression.workspace = true
|
|
||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
bincode.workspace = true
|
bincode.workspace = true
|
||||||
@@ -22,7 +21,6 @@ camino.workspace = true
|
|||||||
chrono.workspace = true
|
chrono.workspace = true
|
||||||
heapless.workspace = true
|
heapless.workspace = true
|
||||||
hex = { workspace = true, features = ["serde"] }
|
hex = { workspace = true, features = ["serde"] }
|
||||||
humantime.workspace = true
|
|
||||||
hyper = { workspace = true, features = ["full"] }
|
hyper = { workspace = true, features = ["full"] }
|
||||||
fail.workspace = true
|
fail.workspace = true
|
||||||
futures = { workspace = true}
|
futures = { workspace = true}
|
||||||
@@ -38,7 +36,6 @@ serde_json.workspace = true
|
|||||||
signal-hook.workspace = true
|
signal-hook.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
tokio-tar.workspace = true
|
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
tracing-error.workspace = true
|
tracing-error.workspace = true
|
||||||
@@ -49,7 +46,6 @@ strum.workspace = true
|
|||||||
strum_macros.workspace = true
|
strum_macros.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
uuid.workspace = true
|
uuid.workspace = true
|
||||||
walkdir.workspace = true
|
|
||||||
|
|
||||||
pq_proto.workspace = true
|
pq_proto.workspace = true
|
||||||
postgres_connection.workspace = true
|
postgres_connection.workspace = true
|
||||||
|
|||||||
@@ -1,21 +0,0 @@
|
|||||||
//! Wrapper around `std::env::var` for parsing environment variables.
|
|
||||||
|
|
||||||
use std::{fmt::Display, str::FromStr};
|
|
||||||
|
|
||||||
pub fn var<V, E>(varname: &str) -> Option<V>
|
|
||||||
where
|
|
||||||
V: FromStr<Err = E>,
|
|
||||||
E: Display,
|
|
||||||
{
|
|
||||||
match std::env::var(varname) {
|
|
||||||
Ok(s) => Some(
|
|
||||||
s.parse()
|
|
||||||
.map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
|
|
||||||
.unwrap(),
|
|
||||||
),
|
|
||||||
Err(std::env::VarError::NotPresent) => None,
|
|
||||||
Err(std::env::VarError::NotUnicode(_)) => {
|
|
||||||
panic!("env var {varname} is not unicode")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -34,8 +34,6 @@ pub enum Generation {
|
|||||||
/// scenarios where pageservers might otherwise issue conflicting writes to
|
/// scenarios where pageservers might otherwise issue conflicting writes to
|
||||||
/// remote storage
|
/// remote storage
|
||||||
impl Generation {
|
impl Generation {
|
||||||
pub const MAX: Self = Self::Valid(u32::MAX);
|
|
||||||
|
|
||||||
/// Create a new Generation that represents a legacy key format with
|
/// Create a new Generation that represents a legacy key format with
|
||||||
/// no generation suffix
|
/// no generation suffix
|
||||||
pub fn none() -> Self {
|
pub fn none() -> Self {
|
||||||
|
|||||||
@@ -47,10 +47,9 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(serde::Serialize, serde::Deserialize)]
|
#[derive(serde::Serialize)]
|
||||||
struct SerdeRepr<T> {
|
struct SerdeRepr<T> {
|
||||||
buffer: Vec<T>,
|
buffer: Vec<T>,
|
||||||
buffer_size: usize,
|
|
||||||
drop_count: u64,
|
drop_count: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -62,7 +61,6 @@ where
|
|||||||
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
|
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
|
||||||
SerdeRepr {
|
SerdeRepr {
|
||||||
buffer: buffer.iter().cloned().collect(),
|
buffer: buffer.iter().cloned().collect(),
|
||||||
buffer_size: L,
|
|
||||||
drop_count: *drop_count,
|
drop_count: *drop_count,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -80,52 +78,19 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
|
|
||||||
where
|
|
||||||
T: Clone + serde::Deserialize<'de>,
|
|
||||||
{
|
|
||||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
|
||||||
where
|
|
||||||
D: serde::Deserializer<'de>,
|
|
||||||
{
|
|
||||||
let SerdeRepr {
|
|
||||||
buffer: des_buffer,
|
|
||||||
drop_count,
|
|
||||||
buffer_size,
|
|
||||||
} = SerdeRepr::<T>::deserialize(deserializer)?;
|
|
||||||
if buffer_size != L {
|
|
||||||
use serde::de::Error;
|
|
||||||
return Err(D::Error::custom(format!(
|
|
||||||
"invalid buffer_size, expecting {L} got {buffer_size}"
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
let mut buffer = HistoryBuffer::new();
|
|
||||||
buffer.extend(des_buffer);
|
|
||||||
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::HistoryBufferWithDropCounter;
|
use super::HistoryBufferWithDropCounter;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_basics() {
|
fn test_basics() {
|
||||||
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
|
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
|
||||||
b.write(1);
|
b.write(1);
|
||||||
b.write(2);
|
b.write(2);
|
||||||
b.write(3);
|
b.write(3);
|
||||||
assert!(b.iter().any(|e| *e == 2));
|
assert!(b.iter().any(|e| *e == 2));
|
||||||
assert!(b.iter().any(|e| *e == 3));
|
assert!(b.iter().any(|e| *e == 3));
|
||||||
assert!(!b.iter().any(|e| *e == 1));
|
assert!(!b.iter().any(|e| *e == 1));
|
||||||
|
|
||||||
// round-trip serde
|
|
||||||
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
|
|
||||||
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
|
|
||||||
assert_eq!(
|
|
||||||
round_tripped.iter().cloned().collect::<Vec<_>>(),
|
|
||||||
b.iter().cloned().collect::<Vec<_>>()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
SERVE_METRICS_COUNT.inc();
|
SERVE_METRICS_COUNT.inc();
|
||||||
|
|
||||||
let started_at = std::time::Instant::now();
|
let started_at = std::time::Instant::now();
|
||||||
@@ -367,6 +367,7 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
|||||||
.middleware(Middleware::post_with_info(
|
.middleware(Middleware::post_with_info(
|
||||||
add_request_id_header_to_response,
|
add_request_id_header_to_response,
|
||||||
))
|
))
|
||||||
|
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||||
.err_handler(route_error_handler)
|
.err_handler(route_error_handler)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -63,7 +63,6 @@ pub mod measured_stream;
|
|||||||
|
|
||||||
pub mod serde_percent;
|
pub mod serde_percent;
|
||||||
pub mod serde_regex;
|
pub mod serde_regex;
|
||||||
pub mod serde_system_time;
|
|
||||||
|
|
||||||
pub mod pageserver_feedback;
|
pub mod pageserver_feedback;
|
||||||
|
|
||||||
@@ -88,12 +87,6 @@ pub mod failpoint_support;
|
|||||||
|
|
||||||
pub mod yielding_loop;
|
pub mod yielding_loop;
|
||||||
|
|
||||||
pub mod zstd;
|
|
||||||
|
|
||||||
pub mod env;
|
|
||||||
|
|
||||||
pub mod poison;
|
|
||||||
|
|
||||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||||
///
|
///
|
||||||
/// we have several cases:
|
/// we have several cases:
|
||||||
|
|||||||
@@ -63,7 +63,6 @@ impl UnwrittenLockFile {
|
|||||||
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
|
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
|
||||||
let lock_file = fs::OpenOptions::new()
|
let lock_file = fs::OpenOptions::new()
|
||||||
.create(true) // O_CREAT
|
.create(true) // O_CREAT
|
||||||
.truncate(true)
|
|
||||||
.write(true)
|
.write(true)
|
||||||
.open(lock_file_path)
|
.open(lock_file_path)
|
||||||
.context("open lock file")?;
|
.context("open lock file")?;
|
||||||
|
|||||||
@@ -29,10 +29,12 @@ pub struct PageserverFeedback {
|
|||||||
// Serialize with RFC3339 format.
|
// Serialize with RFC3339 format.
|
||||||
#[serde(with = "serde_systemtime")]
|
#[serde(with = "serde_systemtime")]
|
||||||
pub replytime: SystemTime,
|
pub replytime: SystemTime,
|
||||||
/// Used to track feedbacks from different shards. Always zero for unsharded tenants.
|
|
||||||
pub shard_number: u32,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
|
||||||
|
// Do not remove previously available fields because this might be backwards incompatible.
|
||||||
|
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
||||||
|
|
||||||
impl PageserverFeedback {
|
impl PageserverFeedback {
|
||||||
pub fn empty() -> PageserverFeedback {
|
pub fn empty() -> PageserverFeedback {
|
||||||
PageserverFeedback {
|
PageserverFeedback {
|
||||||
@@ -41,7 +43,6 @@ impl PageserverFeedback {
|
|||||||
remote_consistent_lsn: Lsn::INVALID,
|
remote_consistent_lsn: Lsn::INVALID,
|
||||||
disk_consistent_lsn: Lsn::INVALID,
|
disk_consistent_lsn: Lsn::INVALID,
|
||||||
replytime: *PG_EPOCH,
|
replytime: *PG_EPOCH,
|
||||||
shard_number: 0,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -58,26 +59,17 @@ impl PageserverFeedback {
|
|||||||
//
|
//
|
||||||
// TODO: change serialized fields names once all computes migrate to rename.
|
// TODO: change serialized fields names once all computes migrate to rename.
|
||||||
pub fn serialize(&self, buf: &mut BytesMut) {
|
pub fn serialize(&self, buf: &mut BytesMut) {
|
||||||
let buf_ptr = buf.len();
|
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||||
buf.put_u8(0); // # of keys, will be filled later
|
|
||||||
let mut nkeys = 0;
|
|
||||||
|
|
||||||
nkeys += 1;
|
|
||||||
buf.put_slice(b"current_timeline_size\0");
|
buf.put_slice(b"current_timeline_size\0");
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_u64(self.current_timeline_size);
|
buf.put_u64(self.current_timeline_size);
|
||||||
|
|
||||||
nkeys += 1;
|
|
||||||
buf.put_slice(b"ps_writelsn\0");
|
buf.put_slice(b"ps_writelsn\0");
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_u64(self.last_received_lsn.0);
|
buf.put_u64(self.last_received_lsn.0);
|
||||||
|
|
||||||
nkeys += 1;
|
|
||||||
buf.put_slice(b"ps_flushlsn\0");
|
buf.put_slice(b"ps_flushlsn\0");
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_u64(self.disk_consistent_lsn.0);
|
buf.put_u64(self.disk_consistent_lsn.0);
|
||||||
|
|
||||||
nkeys += 1;
|
|
||||||
buf.put_slice(b"ps_applylsn\0");
|
buf.put_slice(b"ps_applylsn\0");
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_u64(self.remote_consistent_lsn.0);
|
buf.put_u64(self.remote_consistent_lsn.0);
|
||||||
@@ -88,19 +80,9 @@ impl PageserverFeedback {
|
|||||||
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
||||||
.as_micros() as i64;
|
.as_micros() as i64;
|
||||||
|
|
||||||
nkeys += 1;
|
|
||||||
buf.put_slice(b"ps_replytime\0");
|
buf.put_slice(b"ps_replytime\0");
|
||||||
buf.put_i32(8);
|
buf.put_i32(8);
|
||||||
buf.put_i64(timestamp);
|
buf.put_i64(timestamp);
|
||||||
|
|
||||||
if self.shard_number > 0 {
|
|
||||||
nkeys += 1;
|
|
||||||
buf.put_slice(b"shard_number\0");
|
|
||||||
buf.put_i32(4);
|
|
||||||
buf.put_u32(self.shard_number);
|
|
||||||
}
|
|
||||||
|
|
||||||
buf[buf_ptr] = nkeys;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Deserialize PageserverFeedback message
|
// Deserialize PageserverFeedback message
|
||||||
@@ -143,8 +125,9 @@ impl PageserverFeedback {
|
|||||||
}
|
}
|
||||||
b"shard_number" => {
|
b"shard_number" => {
|
||||||
let len = buf.get_i32();
|
let len = buf.get_i32();
|
||||||
assert_eq!(len, 4);
|
// TODO: this will be implemented in the next update,
|
||||||
rf.shard_number = buf.get_u32();
|
// for now, we just skip the value.
|
||||||
|
buf.advance(len as usize);
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
let len = buf.get_i32();
|
let len = buf.get_i32();
|
||||||
@@ -217,7 +200,10 @@ mod tests {
|
|||||||
rf.serialize(&mut data);
|
rf.serialize(&mut data);
|
||||||
|
|
||||||
// Add an extra field to the buffer and adjust number of keys
|
// Add an extra field to the buffer and adjust number of keys
|
||||||
data[0] += 1;
|
if let Some(first) = data.first_mut() {
|
||||||
|
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
|
||||||
|
}
|
||||||
|
|
||||||
data.put_slice(b"new_field_one\0");
|
data.put_slice(b"new_field_one\0");
|
||||||
data.put_i32(8);
|
data.put_i32(8);
|
||||||
data.put_u64(42);
|
data.put_u64(42);
|
||||||
|
|||||||
@@ -1,121 +0,0 @@
|
|||||||
//! Protect a piece of state from reuse after it is left in an inconsistent state.
|
|
||||||
//!
|
|
||||||
//! # Example
|
|
||||||
//!
|
|
||||||
//! ```
|
|
||||||
//! # tokio_test::block_on(async {
|
|
||||||
//! use utils::poison::Poison;
|
|
||||||
//! use std::time::Duration;
|
|
||||||
//!
|
|
||||||
//! struct State {
|
|
||||||
//! clean: bool,
|
|
||||||
//! }
|
|
||||||
//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
|
|
||||||
//!
|
|
||||||
//! let mut mutex_guard = state.lock().await;
|
|
||||||
//! let mut poison_guard = mutex_guard.check_and_arm()?;
|
|
||||||
//! let state = poison_guard.data_mut();
|
|
||||||
//! state.clean = false;
|
|
||||||
//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
|
|
||||||
//! tokio::time::sleep(Duration::from_secs(10)).await;
|
|
||||||
//! state.clean = true;
|
|
||||||
//! poison_guard.disarm();
|
|
||||||
//! # Ok::<(), utils::poison::Error>(())
|
|
||||||
//! # });
|
|
||||||
//! ```
|
|
||||||
|
|
||||||
use tracing::warn;
|
|
||||||
|
|
||||||
pub struct Poison<T> {
|
|
||||||
what: &'static str,
|
|
||||||
state: State,
|
|
||||||
data: T,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
|
||||||
enum State {
|
|
||||||
Clean,
|
|
||||||
Armed,
|
|
||||||
Poisoned { at: chrono::DateTime<chrono::Utc> },
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> Poison<T> {
|
|
||||||
/// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
|
|
||||||
pub fn new(what: &'static str, data: T) -> Self {
|
|
||||||
Self {
|
|
||||||
what,
|
|
||||||
state: State::Clean,
|
|
||||||
data,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
|
|
||||||
pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
|
|
||||||
match self.state {
|
|
||||||
State::Clean => {
|
|
||||||
self.state = State::Armed;
|
|
||||||
Ok(Guard(self))
|
|
||||||
}
|
|
||||||
State::Armed => unreachable!("transient state"),
|
|
||||||
State::Poisoned { at } => Err(Error::Poisoned {
|
|
||||||
what: self.what,
|
|
||||||
at,
|
|
||||||
}),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
|
|
||||||
/// Once modifications are done, use [`Self::disarm`].
|
|
||||||
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
|
|
||||||
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
|
|
||||||
pub struct Guard<'a, T>(&'a mut Poison<T>);
|
|
||||||
|
|
||||||
impl<'a, T> Guard<'a, T> {
|
|
||||||
pub fn data(&self) -> &T {
|
|
||||||
&self.0.data
|
|
||||||
}
|
|
||||||
pub fn data_mut(&mut self) -> &mut T {
|
|
||||||
&mut self.0.data
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn disarm(self) {
|
|
||||||
match self.0.state {
|
|
||||||
State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
|
|
||||||
State::Armed => {
|
|
||||||
self.0.state = State::Clean;
|
|
||||||
}
|
|
||||||
State::Poisoned { at } => {
|
|
||||||
unreachable!("we fail check_and_arm() if it's in that state: {at}")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, T> Drop for Guard<'a, T> {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
match self.0.state {
|
|
||||||
State::Clean => {
|
|
||||||
// set by disarm()
|
|
||||||
}
|
|
||||||
State::Armed => {
|
|
||||||
// still armed => poison it
|
|
||||||
let at = chrono::Utc::now();
|
|
||||||
self.0.state = State::Poisoned { at };
|
|
||||||
warn!(at=?at, "poisoning {}", self.0.what);
|
|
||||||
}
|
|
||||||
State::Poisoned { at } => {
|
|
||||||
unreachable!("we fail check_and_arm() if it's in that state: {at}")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug)]
|
|
||||||
pub enum Error {
|
|
||||||
#[error("poisoned at {at}: {what}")]
|
|
||||||
Poisoned {
|
|
||||||
what: &'static str,
|
|
||||||
at: chrono::DateTime<chrono::Utc>,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
@@ -182,18 +182,6 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
|
|
||||||
pub fn would_wait_for(&self, num: V) -> Result<(), V> {
|
|
||||||
let internal = self.internal.lock().unwrap();
|
|
||||||
let cnt = internal.current.cnt_value();
|
|
||||||
drop(internal);
|
|
||||||
if cnt >= num {
|
|
||||||
Ok(())
|
|
||||||
} else {
|
|
||||||
Err(cnt)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Register and return a channel that will be notified when a number arrives,
|
/// Register and return a channel that will be notified when a number arrives,
|
||||||
/// or None, if it has already arrived.
|
/// or None, if it has already arrived.
|
||||||
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
|
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
|
||||||
|
|||||||
@@ -1,55 +0,0 @@
|
|||||||
//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
|
|
||||||
#[serde(transparent)]
|
|
||||||
pub struct SystemTime(
|
|
||||||
#[serde(
|
|
||||||
deserialize_with = "deser_rfc3339_millis",
|
|
||||||
serialize_with = "ser_rfc3339_millis"
|
|
||||||
)]
|
|
||||||
pub std::time::SystemTime,
|
|
||||||
);
|
|
||||||
|
|
||||||
fn ser_rfc3339_millis<S: serde::ser::Serializer>(
|
|
||||||
ts: &std::time::SystemTime,
|
|
||||||
serializer: S,
|
|
||||||
) -> Result<S::Ok, S::Error> {
|
|
||||||
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
|
|
||||||
where
|
|
||||||
D: serde::de::Deserializer<'de>,
|
|
||||||
{
|
|
||||||
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
|
|
||||||
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
/// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
|
|
||||||
fn to_millisecond_precision(time: SystemTime) -> SystemTime {
|
|
||||||
match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
|
|
||||||
Ok(duration) => {
|
|
||||||
let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
|
|
||||||
SystemTime(
|
|
||||||
std::time::SystemTime::UNIX_EPOCH
|
|
||||||
+ std::time::Duration::from_millis(total_millis),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
Err(_) => time,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_serialize_deserialize() {
|
|
||||||
let input = SystemTime(std::time::SystemTime::now());
|
|
||||||
let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
|
|
||||||
let serialized = serde_json::to_string(&input).unwrap();
|
|
||||||
assert_eq!(expected_serialized, serialized);
|
|
||||||
let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
|
|
||||||
assert_eq!(to_millisecond_precision(input), deserialized);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -110,49 +110,6 @@ impl<T> OnceCell<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a guard to an existing initialized value, or returns an unique initialization
|
|
||||||
/// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
|
|
||||||
pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
|
|
||||||
// It looks like OnceCell::get_or_init could be implemented using this method instead of
|
|
||||||
// duplication. However, that makes the future be !Send due to possibly holding on to the
|
|
||||||
// MutexGuard over an await point.
|
|
||||||
loop {
|
|
||||||
let sem = {
|
|
||||||
let guard = self.inner.lock().unwrap();
|
|
||||||
if guard.value.is_some() {
|
|
||||||
return Ok(Guard(guard));
|
|
||||||
}
|
|
||||||
guard.init_semaphore.clone()
|
|
||||||
};
|
|
||||||
|
|
||||||
{
|
|
||||||
let permit = {
|
|
||||||
// increment the count for the duration of queued
|
|
||||||
let _guard = CountWaitingInitializers::start(self);
|
|
||||||
sem.acquire().await
|
|
||||||
};
|
|
||||||
|
|
||||||
let Ok(permit) = permit else {
|
|
||||||
let guard = self.inner.lock().unwrap();
|
|
||||||
if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
|
|
||||||
// there was a take_and_deinit in between
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
assert!(
|
|
||||||
guard.value.is_some(),
|
|
||||||
"semaphore got closed, must be initialized"
|
|
||||||
);
|
|
||||||
return Ok(Guard(guard));
|
|
||||||
};
|
|
||||||
|
|
||||||
permit.forget();
|
|
||||||
}
|
|
||||||
|
|
||||||
let permit = InitPermit(sem);
|
|
||||||
return Err(permit);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||||
/// to complete initializing the inner value.
|
/// to complete initializing the inner value.
|
||||||
///
|
///
|
||||||
@@ -192,14 +149,6 @@ impl<T> OnceCell<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
|
|
||||||
/// initialized.
|
|
||||||
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
|
|
||||||
let inner = self.inner.get_mut().unwrap();
|
|
||||||
|
|
||||||
inner.take_and_deinit()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
|
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
|
||||||
pub fn initializer_count(&self) -> usize {
|
pub fn initializer_count(&self) -> usize {
|
||||||
self.initializers.load(Ordering::Relaxed)
|
self.initializers.load(Ordering::Relaxed)
|
||||||
@@ -253,24 +202,16 @@ impl<'a, T> Guard<'a, T> {
|
|||||||
///
|
///
|
||||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||||
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
|
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
||||||
self.0
|
|
||||||
.take_and_deinit()
|
|
||||||
.expect("guard is not created unless value has been initialized")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> Inner<T> {
|
|
||||||
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
|
|
||||||
let value = self.value.take()?;
|
|
||||||
|
|
||||||
let mut swapped = Inner::default();
|
let mut swapped = Inner::default();
|
||||||
let sem = swapped.init_semaphore.clone();
|
let sem = swapped.init_semaphore.clone();
|
||||||
// acquire and forget right away, moving the control over to InitPermit
|
// acquire and forget right away, moving the control over to InitPermit
|
||||||
sem.try_acquire().expect("we just created this").forget();
|
sem.try_acquire().expect("we just created this").forget();
|
||||||
let permit = InitPermit(sem);
|
std::mem::swap(&mut *self.0, &mut swapped);
|
||||||
std::mem::swap(self, &mut swapped);
|
swapped
|
||||||
Some((value, permit))
|
.value
|
||||||
|
.map(|v| (v, InitPermit(sem)))
|
||||||
|
.expect("guard is not created unless value has been initialized")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -279,13 +220,6 @@ impl<T> Inner<T> {
|
|||||||
/// On drop, this type will return the permit.
|
/// On drop, this type will return the permit.
|
||||||
pub struct InitPermit(Arc<tokio::sync::Semaphore>);
|
pub struct InitPermit(Arc<tokio::sync::Semaphore>);
|
||||||
|
|
||||||
impl std::fmt::Debug for InitPermit {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
let ptr = Arc::as_ptr(&self.0) as *const ();
|
|
||||||
f.debug_tuple("InitPermit").field(&ptr).finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Drop for InitPermit {
|
impl Drop for InitPermit {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -547,57 +481,4 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!("t1", *cell.get().unwrap());
|
assert_eq!("t1", *cell.get().unwrap());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test(start_paused = true)]
|
|
||||||
async fn detached_init_smoke() {
|
|
||||||
let target = OnceCell::default();
|
|
||||||
|
|
||||||
let Err(permit) = target.get_or_init_detached().await else {
|
|
||||||
unreachable!("it is not initialized")
|
|
||||||
};
|
|
||||||
|
|
||||||
tokio::time::timeout(
|
|
||||||
std::time::Duration::from_secs(3600 * 24 * 7 * 365),
|
|
||||||
target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.expect_err("should timeout since we are already holding the permit");
|
|
||||||
|
|
||||||
target.set(42, permit);
|
|
||||||
|
|
||||||
let (_answer, permit) = {
|
|
||||||
let guard = target
|
|
||||||
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
assert_eq!(*guard, 42);
|
|
||||||
|
|
||||||
guard.take_and_deinit()
|
|
||||||
};
|
|
||||||
|
|
||||||
assert!(target.get().is_none());
|
|
||||||
|
|
||||||
target.set(11, permit);
|
|
||||||
|
|
||||||
assert_eq!(*target.get().unwrap(), 11);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::test]
|
|
||||||
async fn take_and_deinit_on_mut() {
|
|
||||||
use std::convert::Infallible;
|
|
||||||
|
|
||||||
let mut target = OnceCell::<u32>::default();
|
|
||||||
assert!(target.take_and_deinit().is_none());
|
|
||||||
|
|
||||||
target
|
|
||||||
.get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let again = target.take_and_deinit();
|
|
||||||
assert!(matches!(again, Some((42, _))), "{again:?}");
|
|
||||||
|
|
||||||
assert!(target.take_and_deinit().is_none());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,60 +1,27 @@
|
|||||||
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
|
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
|
||||||
|
|
||||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
|
||||||
pub enum VecMapOrdering {
|
|
||||||
Greater,
|
|
||||||
GreaterOrEqual,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Ordered map datastructure implemented in a Vec.
|
/// Ordered map datastructure implemented in a Vec.
|
||||||
/// Append only - can only add keys that are larger than the
|
/// Append only - can only add keys that are larger than the
|
||||||
/// current max key.
|
/// current max key.
|
||||||
/// Ordering can be adjusted using [`VecMapOrdering`]
|
|
||||||
/// during `VecMap` construction.
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct VecMap<K, V> {
|
pub struct VecMap<K, V>(Vec<(K, V)>);
|
||||||
data: Vec<(K, V)>,
|
|
||||||
ordering: VecMapOrdering,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<K, V> Default for VecMap<K, V> {
|
impl<K, V> Default for VecMap<K, V> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
VecMap {
|
VecMap(Default::default())
|
||||||
data: Default::default(),
|
|
||||||
ordering: VecMapOrdering::Greater,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug)]
|
#[derive(Debug)]
|
||||||
pub enum VecMapError {
|
pub struct InvalidKey;
|
||||||
#[error("Key violates ordering constraint")]
|
|
||||||
InvalidKey,
|
|
||||||
#[error("Mismatched ordering constraints")]
|
|
||||||
ExtendOrderingError,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<K: Ord, V> VecMap<K, V> {
|
impl<K: Ord, V> VecMap<K, V> {
|
||||||
pub fn new(ordering: VecMapOrdering) -> Self {
|
|
||||||
Self {
|
|
||||||
data: Vec::new(),
|
|
||||||
ordering,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
|
|
||||||
Self {
|
|
||||||
data: Vec::with_capacity(capacity),
|
|
||||||
ordering,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn is_empty(&self) -> bool {
|
pub fn is_empty(&self) -> bool {
|
||||||
self.data.is_empty()
|
self.0.is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn as_slice(&self) -> &[(K, V)] {
|
pub fn as_slice(&self) -> &[(K, V)] {
|
||||||
self.data.as_slice()
|
self.0.as_slice()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This function may panic if given a range where the lower bound is
|
/// This function may panic if given a range where the lower bound is
|
||||||
@@ -62,7 +29,7 @@ impl<K: Ord, V> VecMap<K, V> {
|
|||||||
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
|
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
|
||||||
use std::ops::Bound::*;
|
use std::ops::Bound::*;
|
||||||
|
|
||||||
let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
|
let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
|
||||||
|
|
||||||
let start_idx = match range.start_bound() {
|
let start_idx = match range.start_bound() {
|
||||||
Unbounded => 0,
|
Unbounded => 0,
|
||||||
@@ -74,7 +41,7 @@ impl<K: Ord, V> VecMap<K, V> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let end_idx = match range.end_bound() {
|
let end_idx = match range.end_bound() {
|
||||||
Unbounded => self.data.len(),
|
Unbounded => self.0.len(),
|
||||||
Included(k) => match binary_search(k) {
|
Included(k) => match binary_search(k) {
|
||||||
Ok(idx) => idx + 1,
|
Ok(idx) => idx + 1,
|
||||||
Err(idx) => idx,
|
Err(idx) => idx,
|
||||||
@@ -82,30 +49,34 @@ impl<K: Ord, V> VecMap<K, V> {
|
|||||||
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
|
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
|
||||||
};
|
};
|
||||||
|
|
||||||
&self.data[start_idx..end_idx]
|
&self.0[start_idx..end_idx]
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a key value pair to the map.
|
/// Add a key value pair to the map.
|
||||||
/// If `key` is not respective of the `self` ordering the
|
/// If `key` is less than or equal to the current maximum key
|
||||||
/// pair will not be added and `InvalidKey` error will be returned.
|
/// the pair will not be added and InvalidKey error will be returned.
|
||||||
pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
|
pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
|
||||||
self.validate_key_order(&key)?;
|
if let Some((last_key, _last_value)) = self.0.last() {
|
||||||
|
if &key <= last_key {
|
||||||
|
return Err(InvalidKey);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
|
let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
|
||||||
Ok(delta_size)
|
Ok(delta_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Update the maximum key value pair or add a new key value pair to the map.
|
/// Update the maximum key value pair or add a new key value pair to the map.
|
||||||
/// If `key` is not respective of the `self` ordering no updates or additions
|
/// If `key` is less than the current maximum key no updates or additions
|
||||||
/// will occur and `InvalidKey` error will be returned.
|
/// will occur and InvalidKey error will be returned.
|
||||||
pub fn append_or_update_last(
|
pub fn append_or_update_last(
|
||||||
&mut self,
|
&mut self,
|
||||||
key: K,
|
key: K,
|
||||||
mut value: V,
|
mut value: V,
|
||||||
) -> Result<(Option<V>, usize), VecMapError> {
|
) -> Result<(Option<V>, usize), InvalidKey> {
|
||||||
if let Some((last_key, last_value)) = self.data.last_mut() {
|
if let Some((last_key, last_value)) = self.0.last_mut() {
|
||||||
match key.cmp(last_key) {
|
match key.cmp(last_key) {
|
||||||
Ordering::Less => return Err(VecMapError::InvalidKey),
|
Ordering::Less => return Err(InvalidKey),
|
||||||
Ordering::Equal => {
|
Ordering::Equal => {
|
||||||
std::mem::swap(last_value, &mut value);
|
std::mem::swap(last_value, &mut value);
|
||||||
const DELTA_SIZE: usize = 0;
|
const DELTA_SIZE: usize = 0;
|
||||||
@@ -129,67 +100,40 @@ impl<K: Ord, V> VecMap<K, V> {
|
|||||||
V: Clone,
|
V: Clone,
|
||||||
{
|
{
|
||||||
let split_idx = self
|
let split_idx = self
|
||||||
.data
|
.0
|
||||||
.binary_search_by_key(&cutoff, extract_key)
|
.binary_search_by_key(&cutoff, extract_key)
|
||||||
.unwrap_or_else(std::convert::identity);
|
.unwrap_or_else(std::convert::identity);
|
||||||
|
|
||||||
(
|
(
|
||||||
VecMap {
|
VecMap(self.0[..split_idx].to_vec()),
|
||||||
data: self.data[..split_idx].to_vec(),
|
VecMap(self.0[split_idx..].to_vec()),
|
||||||
ordering: self.ordering,
|
|
||||||
},
|
|
||||||
VecMap {
|
|
||||||
data: self.data[split_idx..].to_vec(),
|
|
||||||
ordering: self.ordering,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Move items from `other` to the end of `self`, leaving `other` empty.
|
/// Move items from `other` to the end of `self`, leaving `other` empty.
|
||||||
/// If the `other` ordering is different from `self` ordering
|
/// If any keys in `other` is less than or equal to any key in `self`,
|
||||||
/// `ExtendOrderingError` error will be returned.
|
/// `InvalidKey` error will be returned and no mutation will occur.
|
||||||
/// If any keys in `other` is not respective of the ordering defined in
|
pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
|
||||||
/// `self`, `InvalidKey` error will be returned and no mutation will occur.
|
let self_last_opt = self.0.last().map(extract_key);
|
||||||
pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
|
let other_first_opt = other.0.last().map(extract_key);
|
||||||
if self.ordering != other.ordering {
|
|
||||||
return Err(VecMapError::ExtendOrderingError);
|
|
||||||
}
|
|
||||||
|
|
||||||
let other_first_opt = other.data.last().map(extract_key);
|
if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
|
||||||
if let Some(other_first) = other_first_opt {
|
if self_last >= other_first {
|
||||||
self.validate_key_order(other_first)?;
|
return Err(InvalidKey);
|
||||||
}
|
|
||||||
|
|
||||||
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
|
|
||||||
Ok(delta_size)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Validate the current last key in `self` and key being
|
|
||||||
/// inserted against the order defined in `self`.
|
|
||||||
fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
|
|
||||||
if let Some(last_key) = self.data.last().map(extract_key) {
|
|
||||||
match (&self.ordering, &key.cmp(last_key)) {
|
|
||||||
(VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
|
|
||||||
return Err(VecMapError::InvalidKey);
|
|
||||||
}
|
|
||||||
(VecMapOrdering::Greater, Ordering::Greater) => {}
|
|
||||||
(VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
|
|
||||||
return Err(VecMapError::InvalidKey);
|
|
||||||
}
|
|
||||||
(VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
|
||||||
|
Ok(delta_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Instrument an operation on the underlying [`Vec`].
|
/// Instrument an operation on the underlying [`Vec`].
|
||||||
/// Will panic if the operation decreases capacity.
|
/// Will panic if the operation decreases capacity.
|
||||||
/// Returns the increase in memory usage caused by the op.
|
/// Returns the increase in memory usage caused by the op.
|
||||||
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
|
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
|
||||||
let old_cap = self.data.capacity();
|
let old_cap = self.0.capacity();
|
||||||
op(&mut self.data);
|
op(&mut self.0);
|
||||||
let new_cap = self.data.capacity();
|
let new_cap = self.0.capacity();
|
||||||
|
|
||||||
match old_cap.cmp(&new_cap) {
|
match old_cap.cmp(&new_cap) {
|
||||||
Ordering::Less => {
|
Ordering::Less => {
|
||||||
@@ -201,36 +145,6 @@ impl<K: Ord, V> VecMap<K, V> {
|
|||||||
Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
|
Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Similar to `from_iter` defined in `FromIter` trait except
|
|
||||||
/// that it accepts an [`VecMapOrdering`]
|
|
||||||
pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
|
|
||||||
let iter = iter.into_iter();
|
|
||||||
let initial_capacity = {
|
|
||||||
match iter.size_hint() {
|
|
||||||
(lower_bound, None) => lower_bound,
|
|
||||||
(_, Some(upper_bound)) => upper_bound,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
|
|
||||||
for (key, value) in iter {
|
|
||||||
vec_map
|
|
||||||
.append(key, value)
|
|
||||||
.expect("The passed collection needs to be sorted!");
|
|
||||||
}
|
|
||||||
|
|
||||||
vec_map
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<K: Ord, V> IntoIterator for VecMap<K, V> {
|
|
||||||
type Item = (K, V);
|
|
||||||
type IntoIter = std::vec::IntoIter<(K, V)>;
|
|
||||||
|
|
||||||
fn into_iter(self) -> Self::IntoIter {
|
|
||||||
self.data.into_iter()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
||||||
@@ -241,7 +155,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use std::{collections::BTreeMap, ops::Bound};
|
use std::{collections::BTreeMap, ops::Bound};
|
||||||
|
|
||||||
use super::{VecMap, VecMapOrdering};
|
use super::VecMap;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn unbounded_range() {
|
fn unbounded_range() {
|
||||||
@@ -396,59 +310,5 @@ mod tests {
|
|||||||
left.extend(&mut one_map).unwrap_err();
|
left.extend(&mut one_map).unwrap_err();
|
||||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||||
assert_eq!(one_map.as_slice(), &[(1, ())]);
|
assert_eq!(one_map.as_slice(), &[(1, ())]);
|
||||||
|
|
||||||
let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
|
||||||
map_greater_or_equal.append(2, ()).unwrap();
|
|
||||||
map_greater_or_equal.append(2, ()).unwrap();
|
|
||||||
|
|
||||||
left.extend(&mut map_greater_or_equal).unwrap_err();
|
|
||||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
|
||||||
assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn extend_with_ordering() {
|
|
||||||
let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
|
||||||
left.append(0, ()).unwrap();
|
|
||||||
assert_eq!(left.as_slice(), &[(0, ())]);
|
|
||||||
|
|
||||||
let mut greater_right = VecMap::new(VecMapOrdering::Greater);
|
|
||||||
greater_right.append(0, ()).unwrap();
|
|
||||||
left.extend(&mut greater_right).unwrap_err();
|
|
||||||
assert_eq!(left.as_slice(), &[(0, ())]);
|
|
||||||
|
|
||||||
let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
|
||||||
greater_or_equal_right.append(2, ()).unwrap();
|
|
||||||
greater_or_equal_right.append(2, ()).unwrap();
|
|
||||||
left.extend(&mut greater_or_equal_right).unwrap();
|
|
||||||
assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn vec_map_from_sorted() {
|
|
||||||
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
|
|
||||||
let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
|
|
||||||
assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
|
|
||||||
|
|
||||||
let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
|
|
||||||
let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
|
|
||||||
assert_eq!(
|
|
||||||
vec_map.as_slice(),
|
|
||||||
&[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
#[should_panic]
|
|
||||||
fn vec_map_from_unsorted_greater() {
|
|
||||||
let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
|
|
||||||
let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
#[should_panic]
|
|
||||||
fn vec_map_from_unsorted_greater_or_equal() {
|
|
||||||
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
|
|
||||||
let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,78 +0,0 @@
|
|||||||
use std::io::SeekFrom;
|
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
|
||||||
use async_compression::{
|
|
||||||
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
|
|
||||||
zstd::CParameter,
|
|
||||||
Level,
|
|
||||||
};
|
|
||||||
use camino::Utf8Path;
|
|
||||||
use nix::NixPath;
|
|
||||||
use tokio::{
|
|
||||||
fs::{File, OpenOptions},
|
|
||||||
io::AsyncBufRead,
|
|
||||||
io::AsyncSeekExt,
|
|
||||||
io::AsyncWriteExt,
|
|
||||||
};
|
|
||||||
use tokio_tar::{Archive, Builder, HeaderMode};
|
|
||||||
use walkdir::WalkDir;
|
|
||||||
|
|
||||||
/// Creates a Zstandard tarball.
|
|
||||||
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
|
|
||||||
let file = OpenOptions::new()
|
|
||||||
.create(true)
|
|
||||||
.truncate(true)
|
|
||||||
.read(true)
|
|
||||||
.write(true)
|
|
||||||
.open(&tarball)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("tempfile creation {tarball}"))?;
|
|
||||||
|
|
||||||
let mut paths = Vec::new();
|
|
||||||
for entry in WalkDir::new(path) {
|
|
||||||
let entry = entry?;
|
|
||||||
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
|
||||||
// Also allow directories so that we also get empty directories
|
|
||||||
if !(metadata.is_file() || metadata.is_dir()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let path = entry.into_path();
|
|
||||||
paths.push(path);
|
|
||||||
}
|
|
||||||
// Do a sort to get a more consistent listing
|
|
||||||
paths.sort_unstable();
|
|
||||||
let zstd = ZstdEncoder::with_quality_and_params(
|
|
||||||
file,
|
|
||||||
Level::Default,
|
|
||||||
&[CParameter::enable_long_distance_matching(true)],
|
|
||||||
);
|
|
||||||
let mut builder = Builder::new(zstd);
|
|
||||||
// Use reproducible header mode
|
|
||||||
builder.mode(HeaderMode::Deterministic);
|
|
||||||
for p in paths {
|
|
||||||
let rel_path = p.strip_prefix(path)?;
|
|
||||||
if rel_path.is_empty() {
|
|
||||||
// The top directory should not be compressed,
|
|
||||||
// the tar crate doesn't like that
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
builder.append_path_with_name(&p, rel_path).await?;
|
|
||||||
}
|
|
||||||
let mut zstd = builder.into_inner().await?;
|
|
||||||
zstd.shutdown().await?;
|
|
||||||
let mut compressed = zstd.into_inner();
|
|
||||||
let compressed_len = compressed.metadata().await?.len();
|
|
||||||
compressed.seek(SeekFrom::Start(0)).await?;
|
|
||||||
Ok((compressed, compressed_len))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extracts a Zstandard tarball into `path`.
|
|
||||||
pub async fn extract_zst_tarball(
|
|
||||||
path: &Utf8Path,
|
|
||||||
tarball: impl AsyncBufRead + Unpin,
|
|
||||||
) -> Result<()> {
|
|
||||||
let decoder = Box::pin(ZstdDecoder::new(tarball));
|
|
||||||
let mut archive = Archive::new(decoder);
|
|
||||||
archive.unpack(path).await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
@@ -69,7 +69,7 @@ pub struct Config {
|
|||||||
/// should be removed once we have a better solution there.
|
/// should be removed once we have a better solution there.
|
||||||
sys_buffer_bytes: u64,
|
sys_buffer_bytes: u64,
|
||||||
|
|
||||||
/// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
|
/// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
|
||||||
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
|
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
|
||||||
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
|
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
|
||||||
/// threshold.
|
/// threshold.
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user