Mirror of https://github.com/neondatabase/neon.git (synced 2026-03-12 04:40:38 +00:00)

Compare commits: proxy-cpla...proxy-asyn (132 commits)
Commits in this range (author and date columns were not captured):

9fe38ed415, e4570fb31f, dd7c4b79e3, a12b338aac, f34134faaf, c003b43781,
5dda371c2b, a60035b23a, 18fd73d84a, ee9ec26808, e22c072064, 89f023e6b0,
8426fb886b, 28e7fa98c4, a9fda8c832, fa12d60237, d551bfee09, e69ff3fc00,
25d9dc6eaf, 139d1346d5, 0bd16182f7, 6a5650d40c, 47addc15f1, b91c58a8bf,
00d9c2d9a8, 3a673dce67, 35e9fb360b, 0d21187322, e8a98adcd0, 98be8b9430,
6eb946e2de, 681a04d287, 3df67bf4d7, 0d8e68003a, 637ad4a638, 8d0f701767,
5191f6ef0e, a54ea8fb1c, d5708e7435, fd49005cb3, 3023de156e, e49e931bc4,
13b9135d4e, 41bb1e42b8, cb4b40f9c1, 9e567d9814, 1c012958c7, e5c50bb12b,
926662eb7c, 3366cd34ba, 2d5a8462c8, 110282ee7e, f752c40f58, 83cdbbb89a,
5288f9621e, e8338c60f9, 94505fd672, e92fb94149, 40f15c3123, 5299f917d6,
99a56b5606, 1628b5b145, db72543f4d, d47e4a2a41, f86845f64b, 0bb04ebe19,
5efe95a008, c0ff4f18dc, fd88d4608c, 221414de4b, dbac2d2c47, 4f4f787119,
bcab344490, f212630da2, a306d0a54b, 1081a4d246, 47b705cffe, 2d3c9f0d43,
21b3e1d13b, 0788760451, 74b2314a5d, edcaae6290, 4fc95d2d71, 534c099b42,
ec01292b55, 66fc465484, 55da8eff4f, 0fa517eb80, 8ceb4f0a69, 6019ccef06,
0c6367a732, e17bc6afb4, ac7fc6110b, 862a6b7018, 4810c22607, 9d754e984f,
375e15815c, 7ce613354e, ae15acdee7, c5f64fe54f, 40852b955d, b30b15e7cb,
36b875388f, 3f77f26aa2, 8b10407be4, 944313ffe1, d443d07518, 3de416a016,
bc05d7eb9c, d8da51e78a, 6e3834d506, 582cec53c5, 9957c6a9a0, a5777bab09,
90a8ff55fa, 3b95e8072a, 8ee54ffd30, 3ab9f56f5f, 7ddc7b4990, 63213fc814,
090123a429, 39d1818ae9, 90be79fcf5, c52b80b930, 722f271f6e, be1d8fc4f7,
25c4b676e0, 6633332e67, 5928f6709c, 63b2060aef, 24c5a5ac16, 7f9cc1bd5e
@@ -22,6 +22,7 @@
 !s3_scrubber/
 !safekeeper/
 !storage_broker/
+!storage_controller/
 !trace/
 !vendor/postgres-*/
 !workspace_hack/
@@ -150,7 +150,7 @@ runs:

       # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
       # and to keep files on the host to upload them to the database
-      time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+      time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"

       # Generate redirect
       cat <<EOF > ${WORKDIR}/index.html
@@ -10,7 +10,7 @@ inputs:
     required: true
   api_host:
     desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    default: console-stage.neon.build
 outputs:
   dsn:
     description: 'Created Branch DSN (for main database)'
@@ -13,7 +13,7 @@ inputs:
     required: true
   api_host:
     desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    default: console-stage.neon.build

 runs:
   using: "composite"
@@ -13,7 +13,7 @@ inputs:
     default: 15
   api_host:
     desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    default: console-stage.neon.build
   provisioner:
     desctiption: 'k8s-pod or k8s-neonvm'
     default: 'k8s-pod'
@@ -10,7 +10,7 @@ inputs:
     required: true
   api_host:
     desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    default: console-stage.neon.build

 runs:
   using: "composite"
.github/workflows/approved-for-ci-run.yml (vendored, 1 line changed)

@@ -18,6 +18,7 @@ on:

 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+  cancel-in-progress: false

 env:
   GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/benchmarking.yml (vendored, 58 lines changed)

@@ -147,15 +147,16 @@ jobs:
             "neonvm-captest-new"
           ],
           "db_size": [ "10gb" ],
           "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
                       { "platform": "neon-captest-new", "db_size": "50gb" },
                       { "platform": "neonvm-captest-freetier", "db_size": "3gb" },
-                      { "platform": "neonvm-captest-new", "db_size": "50gb" }]
+                      { "platform": "neonvm-captest-new", "db_size": "50gb" },
+                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
         }'

         if [ "$(date +%A)" = "Saturday" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
                                                      { "platform": "rds-aurora", "db_size": "50gb"}]')
         fi

         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

@@ -171,7 +172,7 @@ jobs:

         if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
                                                      { "platform": "rds-aurora" }]')
         fi

         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

@@ -190,7 +191,7 @@ jobs:

         if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
                                                      { "platform": "rds-aurora", "scale": "10" }]')
         fi

         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

@@ -253,6 +254,9 @@ jobs:
           neon-captest-reuse)
             CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
             ;;
+          neonvm-captest-sharding-reuse)
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
+            ;;
           neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
             CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
             ;;

@@ -270,11 +274,15 @@ jobs:

         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

     - name: Benchmark init
       uses: ./.github/actions/run-python-test-set

@@ -401,11 +409,15 @@ jobs:

         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

     - name: ClickBench benchmark
       uses: ./.github/actions/run-python-test-set

@@ -507,11 +519,15 @@ jobs:

         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

     - name: Run TPC-H benchmark
       uses: ./.github/actions/run-python-test-set

@@ -597,11 +613,15 @@ jobs:

         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

     - name: Run user examples
       uses: ./.github/actions/run-python-test-set
@@ -21,6 +21,7 @@ defaults:

 concurrency:
   group: build-build-tools-image-${{ inputs.image-tag }}
+  cancel-in-progress: false

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
.github/workflows/build_and_test.yml (vendored, 11 lines changed)

@@ -735,7 +735,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2

      - uses: docker/login-action@v3
        with:

@@ -792,7 +792,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2
        with:
          # Disable parallelism for docker buildkit.
          # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.

@@ -865,7 +865,7 @@ jobs:
      run:
        shell: sh -eu {0}
      env:
-        VM_BUILDER_VERSION: v0.23.2
+        VM_BUILDER_VERSION: v0.28.1

      steps:
      - name: Checkout

@@ -1127,15 +1127,15 @@ jobs:
            -f deployProxy=false \
            -f deployStorage=true \
            -f deployStorageBroker=true \
+            -f deployStorageController=true \
            -f branch=main \
            -f dockerTag=${{needs.tag.outputs.build-tag}} \
            -f deployPreprodRegion=true

          gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-            -f deployPgSniRouter=false \
-            -f deployProxy=false \
            -f deployStorage=true \
            -f deployStorageBroker=true \
+            -f deployStorageController=true \
            -f branch=main \
            -f dockerTag=${{needs.tag.outputs.build-tag}}
        elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then

@@ -1144,6 +1144,7 @@ jobs:
            -f deployProxy=true \
            -f deployStorage=false \
            -f deployStorageBroker=false \
+            -f deployStorageController=false \
            -f branch=main \
            -f dockerTag=${{needs.tag.outputs.build-tag}} \
            -f deployPreprodRegion=true
@@ -28,7 +28,9 @@ jobs:
      - name: Get build-tools image tag for the current commit
        id: get-build-tools-tag
        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
+          # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
+          COMMIT_SHA: ${{ github.sha }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          LAST_BUILD_TOOLS_SHA=$(
.github/workflows/pin-build-tools-image.yml (vendored, 1 line changed)

@@ -20,6 +20,7 @@ defaults:

 concurrency:
   group: pin-build-tools-image-${{ inputs.from-tag }}
+  cancel-in-progress: false

 permissions: {}

.github/workflows/trigger-e2e-tests.yml (vendored, 90 lines changed)

@@ -62,14 +62,14 @@ jobs:

   trigger-e2e-tests:
     needs: [ tag ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: ubuntu-latest
     env:
       TAG: ${{ needs.tag.outputs.build-tag }}
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
-      options: --init
     steps:
       - name: check if ecr image are present
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
         run: |
           for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
             OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)

@@ -79,41 +79,55 @@ jobs:
           fi
         done

-      - name: Set PR's status to pending and request a remote CI test
+      - name: Set e2e-platforms
+        id: e2e-platforms
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
-          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
-          # to place a job run status update later.
-          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
-          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
-          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          # Default set of platforms to run e2e tests on
+          platforms='["docker", "k8s"]'

-          REMOTE_REPO="${{ github.repository_owner }}/cloud"
+          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
+          # If the workflow run is not a pull request, add k8s-neonvm to the list.
+          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
+            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
+              case "$f" in
+                vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
+                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
+                  ;;
+                *)
+                  # no-op
+                  ;;
+              esac
+            done
+          else
+            platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
+          fi

-          curl -f -X POST \
-          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"state\": \"pending\",
-              \"context\": \"neon-cloud-e2e\",
-              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
-            }"
+          echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT

-          curl -f -X POST \
-          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"ref\": \"main\",
-              \"inputs\": {
-                \"ci_job_name\": \"neon-cloud-e2e\",
-                \"commit_hash\": \"$COMMIT_SHA\",
-                \"remote_repo\": \"${{ github.repository }}\",
-                \"storage_image_tag\": \"${TAG}\",
-                \"compute_image_tag\": \"${TAG}\",
-                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
-              }
-            }"
+      - name: Set PR's status to pending and request a remote CI test
+        env:
+          E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
+
+          gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
+            --method POST \
+            --raw-field "state=pending" \
+            --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
+            --raw-field "context=neon-cloud-e2e"
+
+          gh workflow --repo ${REMOTE_REPO} \
+            run testing.yml \
+            --ref "main" \
+            --raw-field "ci_job_name=neon-cloud-e2e" \
+            --raw-field "commit_hash=$COMMIT_SHA" \
+            --raw-field "remote_repo=${GITHUB_REPOSITORY}" \
+            --raw-field "storage_image_tag=${TAG}" \
+            --raw-field "compute_image_tag=${TAG}" \
+            --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
+            --raw-field "e2e-platforms=${E2E_PLATFORMS}"
@@ -1,5 +1,5 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/control_plane/attachment_service @neondatabase/storage
+/storage_controller @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
 /libs/remote_storage/ @neondatabase/storage
Cargo.lock (generated, 719 lines changed): file diff suppressed because it is too large.
Cargo.toml (27 lines changed)

@@ -3,7 +3,7 @@ resolver = "2"
 members = [
     "compute_tools",
     "control_plane",
-    "control_plane/attachment_service",
+    "control_plane/storcon_cli",
     "pageserver",
     "pageserver/compaction",
     "pageserver/ctl",

@@ -12,6 +12,7 @@ members = [
     "proxy",
     "safekeeper",
     "storage_broker",
+    "storage_controller",
     "s3_scrubber",
     "workspace_hack",
     "trace",

@@ -43,6 +44,7 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
+atomic-take = "1.1.0"
 azure_core = "0.18"
 azure_identity = "0.18"
 azure_storage = "0.18"

@@ -55,6 +57,7 @@ aws-sdk-s3 = "1.14"
 aws-sdk-iam = "1.15.0"
 aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
 aws-smithy-types = "1.1.4"
+aws-smithy-runtime = "1.1.8"
 aws-credential-types = "1.1.4"
 aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
 aws-types = "1.1.7"

@@ -96,7 +99,7 @@ http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.11"
+hyper-tungstenite = "0.13.0"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"

@@ -105,7 +108,8 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.13", features=["default", "lasso"] }
+measured = { version = "0.0.21", features=["lasso"] }
+measured-process = { version = "0.0.21" }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }

@@ -154,11 +158,12 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle" = "2.5.0"
-svg_fmt = "0.4.1"
+# https://github.com/nical/rust_debug/pull/4
+svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
-test-context = "0.1"
+test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"
 tikv-jemalloc-ctl = "0.5"

@@ -190,11 +195,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending

@@ -234,7 +239,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }

 # bug fixes for UUID
 parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
@@ -58,6 +58,12 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
     && mv protoc/include/google /usr/local/include/google \
     && rm -rf protoc.zip protoc

+# s5cmd
+ENV S5CMD_VERSION=2.2.2
+RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
+    && chmod +x s5cmd \
+    && mv s5cmd /usr/local/bin/s5cmd
+
 # LLVM
 ENV LLVM_VERSION=17
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
@@ -944,6 +944,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

+# Create remote extension download directory
+RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
+
 # Install:
 #   libreadline8 for psql
 #   libicu67, locales for collations (including ICU and plpgsql_check)
@@ -818,9 +818,15 @@ impl ComputeNode {
         Client::connect(zenith_admin_connstr.as_str(), NoTls)
             .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
         // Disable forwarding so that users don't get a cloud_admin role
-        client.simple_query("SET neon.forward_ddl = false")?;
-        client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
-        client.simple_query("GRANT zenith_admin TO cloud_admin")?;
+        let mut func = || {
+            client.simple_query("SET neon.forward_ddl = false")?;
+            client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
+            client.simple_query("GRANT zenith_admin TO cloud_admin")?;
+            Ok::<_, anyhow::Error>(())
+        };
+        func().context("apply_config setup cloud_admin")?;

         drop(client);

         // reconnect with connstring with expected name

@@ -832,24 +838,29 @@ impl ComputeNode {
         };

         // Disable DDL forwarding because control plane already knows about these roles/databases.
-        client.simple_query("SET neon.forward_ddl = false")?;
+        client
+            .simple_query("SET neon.forward_ddl = false")
+            .context("apply_config SET neon.forward_ddl = false")?;

         // Proceed with post-startup configuration. Note, that order of operations is important.
         let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
-        create_neon_superuser(spec, &mut client)?;
-        cleanup_instance(&mut client)?;
-        handle_roles(spec, &mut client)?;
-        handle_databases(spec, &mut client)?;
-        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
+        create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
+        cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
+        handle_roles(spec, &mut client).context("apply_config handle_roles")?;
+        handle_databases(spec, &mut client).context("apply_config handle_databases")?;
+        handle_role_deletions(spec, connstr.as_str(), &mut client)
+            .context("apply_config handle_role_deletions")?;
         handle_grants(
             spec,
             &mut client,
             connstr.as_str(),
             self.has_feature(ComputeFeature::AnonExtension),
-        )?;
-        handle_extensions(spec, &mut client)?;
-        handle_extension_neon(&mut client)?;
-        create_availability_check_data(&mut client)?;
+        )
+        .context("apply_config handle_grants")?;
+        handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
+        handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
+        create_availability_check_data(&mut client)
+            .context("apply_config create_availability_check_data")?;

         // 'Close' connection
         drop(client);

@@ -857,7 +868,7 @@ impl ComputeNode {
         // Run migrations separately to not hold up cold starts
         thread::spawn(move || {
             let mut client = Client::connect(connstr.as_str(), NoTls)?;
-            handle_migrations(&mut client)
+            handle_migrations(&mut client).context("apply_config handle_migrations")
         });
         Ok(())
     }

@@ -1262,10 +1273,12 @@ LIMIT 100",
         .await
         .map_err(DownloadError::Other);

-        self.ext_download_progress
-            .write()
-            .expect("bad lock")
-            .insert(ext_archive_name.to_string(), (download_start, true));
+        if download_size.is_ok() {
+            self.ext_download_progress
+                .write()
+                .expect("bad lock")
+                .insert(ext_archive_name.to_string(), (download_start, true));
+        }

         download_size
     }
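A pattern worth noting across these compute_tools hunks: several fallible `simple_query` calls are grouped in a closure so that a single `anyhow::Context` annotation labels the whole configuration step. A minimal sketch of the idiom, assuming only the `postgres` and `anyhow` crates (the function name and queries are illustrative, not from the diff):

```rust
use anyhow::Context;
use postgres::Client;

// Illustrative only: groups several fallible calls behind one context label,
// mirroring the `let mut func = || { ... }; func().context(...)` shape above.
fn setup_admin_role(client: &mut Client) -> anyhow::Result<()> {
    let mut func = || {
        client.simple_query("SET neon.forward_ddl = false")?;
        client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
        // Pin the closure's error type so `?` converts postgres::Error.
        Ok::<_, anyhow::Error>(())
    };
    func().context("setup cloud_admin")?;
    Ok(())
}
```

The closure keeps the happy path terse while the single `.context` call records which configuration phase failed, which is exactly what the apply_config hunks add step by step.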
@@ -6,8 +6,8 @@ use std::path::Path;
 use anyhow::Result;

 use crate::pg_helpers::escape_conf_value;
-use crate::pg_helpers::PgOptionsSerialize;
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
+use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};

 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.

@@ -92,6 +92,27 @@ pub fn write_postgres_conf(
         }
     }

+    if cfg!(target_os = "linux") {
+        // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
+        // disabled), then the control plane has enabled swap and we should set
+        // dynamic_shared_memory_type = 'mmap'.
+        //
+        // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
+        let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
+            // ignore any errors - they may be expected to occur under certain situations (e.g. when
+            // not running in Linux).
+            .unwrap_or_else(|_| String::new());
+        if overcommit_memory_contents.trim() == "2" {
+            let opt = GenericOption {
+                name: "dynamic_shared_memory_type".to_owned(),
+                value: Some("mmap".to_owned()),
+                vartype: "enum".to_owned(),
+            };
+
+            write!(file, "{}", opt.to_pg_setting())?;
+        }
+    }
+
     // If there are any extra options in the 'settings' field, append those
     if spec.cluster.settings.is_some() {
         writeln!(file, "# Managed by compute_ctl: begin")?;
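For reference, the probe this new config.rs block performs can be run standalone: `/proc/sys/vm/overcommit_memory` holds `2` when strict overcommit accounting is in effect. A stdlib-only sketch that reads it the same tolerant way (any read failure is treated as "not strict", as in the diff):

```rust
// Standalone sketch of the overcommit probe; stdlib only.
fn overcommit_disabled() -> bool {
    std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
        // Read errors (e.g. not on Linux) fall back to "not disabled",
        // matching the diff's unwrap_or_else(|_| String::new()).
        .map(|contents| contents.trim() == "2")
        .unwrap_or(false)
}

fn main() {
    if overcommit_disabled() {
        // The real code appends `dynamic_shared_memory_type = 'mmap'`
        // to postgresql.conf at this point.
        println!("strict overcommit: would set dynamic_shared_memory_type = 'mmap'");
    } else {
        println!("overcommit not strict: keep Postgres defaults");
    }
}
```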
@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
     format!("'{}'", res)
 }

-trait GenericOptionExt {
+pub trait GenericOptionExt {
     fn to_pg_option(&self) -> String;
     fn to_pg_setting(&self) -> String;
 }
@@ -2,7 +2,7 @@ use std::fs::File;
 use std::path::Path;
 use std::str::FromStr;

-use anyhow::{anyhow, bail, Result};
+use anyhow::{anyhow, bail, Context, Result};
 use postgres::config::Config;
 use postgres::{Client, NoTls};
 use reqwest::StatusCode;

@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
             RoleAction::Create => {
                 // This branch only runs when roles are created through the console, so it is
                 // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
+                // from neon_superuser.
                 let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                     name.pg_quote()
                 );
                 info!("running role create query: '{}'", &query);

@@ -698,7 +698,8 @@ pub fn handle_grants(

     // it is important to run this after all grants
     if enable_anon_extension {
-        handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
+        handle_extension_anon(spec, &db.owner, &mut db_client, false)
+            .context("handle_grants handle_extension_anon")?;
     }
 }

@@ -743,21 +744,24 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
     // which may happen in two cases:
     // - extension was just installed
     // - extension was already installed and is up to date
-    // DISABLED due to compute node unpinning epic
-    // let query = "ALTER EXTENSION neon UPDATE";
-    // info!("update neon extension version with query: {}", query);
-    // client.simple_query(query)?;
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension version with query: {}", query);
+    if let Err(e) = client.simple_query(query) {
+        error!(
+            "failed to upgrade neon extension during `handle_extension_neon`: {}",
+            e
+        );
+    }

     Ok(())
 }

 #[instrument(skip_all)]
-pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
-    info!("handle neon extension upgrade (not really)");
-    // DISABLED due to compute node unpinning epic
-    // let query = "ALTER EXTENSION neon UPDATE";
-    // info!("update neon extension version with query: {}", query);
-    // client.simple_query(query)?;
+pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
+    info!("handle neon extension upgrade");
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension version with query: {}", query);
+    client.simple_query(query)?;

     Ok(())
 }

@@ -806,43 +810,40 @@ $$;"#,
         "",
         "",
         "",
+        "",
         // Add new migrations below.
-        r#"
-DO $$
-DECLARE
-    role_name TEXT;
-BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
-    END LOOP;
-END
-$$;"#,
     ];

-    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-    client.simple_query(query)?;
+    let mut func = || {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        client.simple_query(query)?;

-    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-    client.simple_query(query)?;
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        client.simple_query(query)?;

-    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-    client.simple_query(query)?;
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        client.simple_query(query)?;

-    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-    client.simple_query(query)?;
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        client.simple_query(query)?;

-    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-    client.simple_query(query)?;
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        client.simple_query(query)?;
+        Ok::<_, anyhow::Error>(())
+    };
+    func().context("handle_migrations prepare")?;

-    query = "SELECT id FROM neon_migration.migration_id";
-    let row = client.query_one(query, &[])?;
+    let query = "SELECT id FROM neon_migration.migration_id";
+    let row = client
+        .query_one(query, &[])
+        .context("handle_migrations get migration_id")?;
     let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
     let starting_migration_id = current_migration;

-    query = "BEGIN";
-    client.simple_query(query)?;
+    let query = "BEGIN";
+    client
+        .simple_query(query)
+        .context("handle_migrations begin")?;

     while current_migration < migrations.len() {
         let migration = &migrations[current_migration];

@@ -850,7 +851,9 @@ $$;"#,
             info!("Skip migration id={}", current_migration);
         } else {
             info!("Running migration:\n{}\n", migration);
-            client.simple_query(migration)?;
+            client.simple_query(migration).with_context(|| {
+                format!("handle_migrations current_migration={}", current_migration)
+            })?;
         }
         current_migration += 1;
     }

@@ -858,10 +861,14 @@ $$;"#,
         "UPDATE neon_migration.migration_id SET id={}",
         migrations.len()
     );
-    client.simple_query(&setval)?;
+    client
+        .simple_query(&setval)
+        .context("handle_migrations update id")?;

-    query = "COMMIT";
-    client.simple_query(query)?;
+    let query = "COMMIT";
+    client
+        .simple_query(query)
+        .context("handle_migrations commit")?;

     info!(
         "Ran {} migrations",
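The migration-runner changes above keep the original control flow (read the stored id, BEGIN, replay pending entries, bump the id, COMMIT) and only add per-step contexts. A condensed sketch of that flow, assuming the `postgres` and `anyhow` crates; the skip-empty check here stands in for the diff's skip branch:

```rust
use anyhow::Context;
use postgres::Client;

fn run_migrations(client: &mut Client, migrations: &[&str]) -> anyhow::Result<()> {
    // Which migrations already ran?
    let row = client
        .query_one("SELECT id FROM neon_migration.migration_id", &[])
        .context("get migration_id")?;
    let mut current: usize = row.get::<&str, i64>("id") as usize;

    // Replay everything pending inside one transaction.
    client.simple_query("BEGIN").context("begin")?;
    while current < migrations.len() {
        let migration = migrations[current];
        if !migration.is_empty() {
            client
                .simple_query(migration)
                .with_context(|| format!("current_migration={current}"))?;
        }
        current += 1;
    }
    let setval = format!(
        "UPDATE neon_migration.migration_id SET id={}",
        migrations.len()
    );
    client.simple_query(&setval).context("update id")?;
    client.simple_query("COMMIT").context("commit")?;
    Ok(())
}
```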
@@ -86,7 +86,10 @@ where
         .stdout(process_log_file)
         .stderr(same_file_for_stderr)
         .args(args);
-    let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
+
+    let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
+        fill_rust_env_vars(background_command),
+    ));
     filled_cmd.envs(envs);

     let pid_file_to_check = match &initial_pid_file {

@@ -268,6 +271,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
     cmd
 }

+fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
+    for (var, val) in std::env::vars() {
+        if var.starts_with("NEON_PAGESERVER_") {
+            cmd = cmd.env(var, val);
+        }
+    }
+    cmd
+}
+
 /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
 /// 1. Claims a pidfile with a fcntl lock on it and
 /// 2. Sets up the pidfile's file descriptor so that it (and the lock)
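`fill_env_vars_prefixed_neon` simply forwards every `NEON_PAGESERVER_`-prefixed variable from the parent environment into the spawned command. The same filter can be demonstrated standalone; in this sketch `env_clear` makes the effect observable (the real code does not clear the environment, it only adds the explicit forwards):

```rust
use std::process::Command;

fn main() -> std::io::Result<()> {
    let mut cmd = Command::new("env"); // any binary that prints its environment
    cmd.env_clear(); // demo only: isolate so the filter's effect is visible
    // Forward only NEON_PAGESERVER_*-prefixed variables, as in the diff.
    cmd.envs(std::env::vars().filter(|(k, _)| k.starts_with("NEON_PAGESERVER_")));
    let status = cmd.status()?;
    println!("child exited with {status}");
    Ok(())
}
```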
@@ -14,9 +14,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
-use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
-};
+use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::models::{
     ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };

@@ -1060,21 +1058,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
         }
     }

-        Some(("set-state", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
-            let scheduling = subcommand_args.get_one("scheduling");
-            let availability = subcommand_args.get_one("availability");
-
-            let storage_controller = StorageController::from_env(env);
-            storage_controller
-                .node_configure(NodeConfigureRequest {
-                    node_id: pageserver.conf.id,
-                    scheduling: scheduling.cloned(),
-                    availability: availability.cloned(),
-                })
-                .await?;
-        }
-
         Some(("status", subcommand_args)) => {
             match get_pageserver(env, subcommand_args)?.check_status().await {
                 Ok(_) => println!("Page server is up and running"),

@@ -1248,7 +1231,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     match ComputeControlPlane::load(env.clone()) {
         Ok(cplane) => {
             for (_k, node) in cplane.endpoints {
-                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
+                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
                     eprintln!("postgres stop failed: {e:#}");
                 }
             }

@@ -1434,6 +1417,7 @@ fn cli() -> Command {
         .subcommand(
             Command::new("timeline")
             .about("Manage timelines")
+            .arg_required_else_help(true)
             .subcommand(Command::new("list")
                 .about("List all timelines, available to this pageserver")
                 .arg(tenant_id_arg.clone()))

@@ -1515,12 +1499,6 @@ fn cli() -> Command {
                 .about("Restart local pageserver")
                 .arg(pageserver_config_args.clone())
             )
-            .subcommand(Command::new("set-state")
-                .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
-                .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
-                .about("Set scheduling or availability state of pageserver node")
-                .arg(pageserver_config_args.clone())
-            )
         )
         .subcommand(
             Command::new("storage_controller")
@@ -156,6 +156,7 @@ pub struct SafekeeperConf {
     pub remote_storage: Option<String>,
     pub backup_threads: Option<u32>,
     pub auth_enabled: bool,
+    pub listen_addr: Option<String>,
 }

 impl Default for SafekeeperConf {

@@ -169,6 +170,7 @@ impl Default for SafekeeperConf {
             remote_storage: None,
             backup_threads: None,
             auth_enabled: false,
+            listen_addr: None,
         }
     }
 }
@@ -389,6 +389,10 @@ impl PageServerNode {
                 .remove("image_creation_threshold")
                 .map(|x| x.parse::<usize>())
                 .transpose()?,
+            image_layer_creation_check_threshold: settings
+                .remove("image_layer_creation_check_threshold")
+                .map(|x| x.parse::<u8>())
+                .transpose()?,
             pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
             walreceiver_connect_timeout: settings
                 .remove("walreceiver_connect_timeout")

@@ -501,6 +505,12 @@ impl PageServerNode {
                 .map(|x| x.parse::<usize>())
                 .transpose()
                 .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
+            image_layer_creation_check_threshold: settings
+                .remove("image_layer_creation_check_threshold")
+                .map(|x| x.parse::<u8>())
+                .transpose()
+                .context("Failed to parse 'image_creation_check_threshold' as integer")?,
+
             pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
             walreceiver_connect_timeout: settings
                 .remove("walreceiver_connect_timeout")
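Both hunks above lean on the same Option-parsing idiom: map a `parse` over an optional setting, then `transpose` the resulting `Option<Result<_>>` into `Result<Option<_>>` so `?` and `.context` apply. Isolated as a small runnable sketch:

```rust
use anyhow::Context;

// Parse an optional string setting into Option<u8>, surfacing parse errors.
fn parse_check_threshold(raw: Option<&str>) -> anyhow::Result<Option<u8>> {
    raw.map(|x| x.parse::<u8>())
        .transpose() // Option<Result<u8, _>> -> Result<Option<u8>, _>
        .context("Failed to parse 'image_layer_creation_check_threshold' as integer")
}

fn main() -> anyhow::Result<()> {
    assert_eq!(parse_check_threshold(Some("4"))?, Some(4));
    assert_eq!(parse_check_threshold(None)?, None);
    assert!(parse_check_threshold(Some("not-a-number")).is_err());
    Ok(())
}
```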
@@ -70,24 +70,31 @@ pub struct SafekeeperNode {
|
|||||||
pub pg_connection_config: PgConnectionConfig,
|
pub pg_connection_config: PgConnectionConfig,
|
||||||
pub env: LocalEnv,
|
pub env: LocalEnv,
|
||||||
pub http_client: reqwest::Client,
|
pub http_client: reqwest::Client,
|
||||||
|
pub listen_addr: String,
|
||||||
pub http_base_url: String,
|
pub http_base_url: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SafekeeperNode {
|
impl SafekeeperNode {
|
||||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||||
|
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
|
||||||
|
listen_addr.clone()
|
||||||
|
} else {
|
||||||
|
"127.0.0.1".to_string()
|
||||||
|
};
|
||||||
SafekeeperNode {
|
SafekeeperNode {
|
||||||
id: conf.id,
|
id: conf.id,
|
||||||
conf: conf.clone(),
|
conf: conf.clone(),
|
||||||
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
|
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
|
||||||
env: env.clone(),
|
env: env.clone(),
|
||||||
http_client: reqwest::Client::new(),
|
http_client: reqwest::Client::new(),
|
||||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
|
||||||
|
listen_addr,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Construct libpq connection string for connecting to this safekeeper.
|
/// Construct libpq connection string for connecting to this safekeeper.
|
||||||
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
|
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
|
||||||
PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
|
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||||
@@ -111,8 +118,8 @@ impl SafekeeperNode {
|
|||||||
);
|
);
|
||||||
io::stdout().flush().unwrap();
|
io::stdout().flush().unwrap();
|
||||||
|
|
||||||
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
|
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
|
||||||
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
|
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
|
||||||
let id = self.id;
|
let id = self.id;
|
||||||
let datadir = self.datadir_path();
|
let datadir = self.datadir_path();
|
||||||
|
|
||||||
@@ -139,7 +146,7 @@ impl SafekeeperNode {
|
|||||||
availability_zone,
|
availability_zone,
|
||||||
];
|
];
|
||||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||||
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
|
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
|
||||||
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
||||||
}
|
}
|
||||||
if !self.conf.sync {
|
if !self.conf.sync {
|
||||||
|
control_plane/storcon_cli/Cargo.toml (new file, 23 lines)

[package]
name = "storcon_cli"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[dependencies]
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack.workspace = true
control_plane/storcon_cli/src/main.rs (new file, 681 lines)

use std::{collections::HashMap, str::FromStr, time::Duration};

use clap::{Parser, Subcommand};
use hyper::{Method, StatusCode};
use pageserver_api::{
    controller_api::{
        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::Url;
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};

use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
    TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
};

#[derive(Subcommand, Debug)]
enum Command {
    /// Register a pageserver with the storage controller. This shouldn't usually be necessary,
    /// since pageservers auto-register when they start up
    NodeRegister {
        #[arg(long)]
        node_id: NodeId,

        #[arg(long)]
        listen_pg_addr: String,
        #[arg(long)]
        listen_pg_port: u16,

        #[arg(long)]
        listen_http_addr: String,
        #[arg(long)]
        listen_http_port: u16,
    },

    /// Modify a node's configuration in the storage controller
    NodeConfigure {
        #[arg(long)]
        node_id: NodeId,

        /// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
        /// manually mark a node offline
        #[arg(long)]
        availability: Option<NodeAvailabilityArg>,
        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
        #[arg(long)]
        scheduling: Option<NodeSchedulingPolicy>,
    },
    /// Modify a tenant's policies in the storage controller
    TenantPolicy {
        #[arg(long)]
        tenant_id: TenantId,
        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
        /// or is in the normal attached state with N secondary locations (`attached:N`)
        #[arg(long)]
        placement: Option<PlacementPolicyArg>,
        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
        /// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
        /// unavailable, and are only for use in emergencies.
        #[arg(long)]
        scheduling: Option<ShardSchedulingPolicyArg>,
    },
    /// List nodes known to the storage controller
    Nodes {},
    /// List tenants known to the storage controller
    Tenants {},
    /// Create a new tenant in the storage controller, and by extension on pageservers.
    TenantCreate {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Delete a tenant in the storage controller, and by extension on pageservers.
    TenantDelete {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Split an existing tenant into a higher number of shards than its current shard count.
    TenantShardSplit {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        shard_count: u8,
        /// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
        #[arg(long)]
        stripe_size: Option<u32>,
    },
    /// Migrate the attached location for a tenant shard to a specific pageserver.
    TenantShardMigrate {
        #[arg(long)]
        tenant_shard_id: TenantShardId,
        #[arg(long)]
        node: NodeId,
    },
    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
    TenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        config: String,
    },
    /// Attempt to balance the locations for a tenant across pageservers. This is a client-side
    /// alternative to the storage controller's scheduling optimization behavior.
    TenantScatter {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Print details about a particular tenant, including all its shards' states.
    TenantDescribe {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
    /// mode so that it can warm up content on a pageserver.
    TenantWarmup {
        #[arg(long)]
        tenant_id: TenantId,
    },
}

#[derive(Parser)]
#[command(
    author,
    version,
    about,
    long_about = "CLI for Storage Controller Support/Debug"
)]
#[command(arg_required_else_help(true))]
struct Cli {
    #[arg(long)]
    /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
    api: Url,

    #[arg(long)]
    /// JWT token for authenticating with storage controller. Depending on the API used, this
    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
    /// a token with both scopes to use with this tool.
    jwt: Option<String>,

    #[command(subcommand)]
    command: Command,
}

#[derive(Debug, Clone)]
struct PlacementPolicyArg(PlacementPolicy);

impl FromStr for PlacementPolicyArg {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "detached" => Ok(Self(PlacementPolicy::Detached)),
            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
            _ if s.starts_with("attached:") => {
                let mut splitter = s.split(':');
                let _prefix = splitter.next().unwrap();
                match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
                    Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
                    None => Err(anyhow::anyhow!(
                        "Invalid format '{s}', a valid example is 'attached:1'"
                    )),
                }
            }
            _ => Err(anyhow::anyhow!(
                "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
            )),
        }
    }
}

#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);

impl FromStr for ShardSchedulingPolicyArg {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "active" => Ok(Self(ShardSchedulingPolicy::Active)),
            "essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
            "pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
            "stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
            _ => Err(anyhow::anyhow!(
                "Unknown scheduling policy '{s}', try active,essential,pause,stop"
            )),
        }
    }
}

#[derive(Debug, Clone)]
struct NodeAvailabilityArg(NodeAvailabilityWrapper);

impl FromStr for NodeAvailabilityArg {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "active" => Ok(Self(NodeAvailabilityWrapper::Active)),
            "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
        }
    }
}

struct Client {
    base_url: Url,
    jwt_token: Option<String>,
    client: reqwest::Client,
}

impl Client {
    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
        Self {
            base_url,
            jwt_token,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
        }
    }

    /// Simple HTTP request wrapper for calling into storage controller
    async fn dispatch<RQ, RS>(
        &self,
        method: hyper::Method,
        path: String,
        body: Option<RQ>,
    ) -> mgmt_api::Result<RS>
    where
        RQ: Serialize + Sized,
        RS: DeserializeOwned + Sized,
    {
        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
        // for general purpose API access.
        let url = Url::from_str(&format!(
            "http://{}:{}/{path}",
            self.base_url.host_str().unwrap(),
            self.base_url.port().unwrap()
        ))
        .unwrap();

        let mut builder = self.client.request(method, url);
        if let Some(body) = body {
            builder = builder.json(&body)
        }
        if let Some(jwt_token) = &self.jwt_token {
            builder = builder.header(
                reqwest::header::AUTHORIZATION,
                format!("Bearer {jwt_token}"),
            );
        }

        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
        let response = response.error_from_body().await?;

        response
            .json()
            .await
            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());

    let mut trimmed = cli.api.to_string();
    trimmed.pop();
    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());

    match cli.command {
        Command::NodeRegister {
            node_id,
            listen_pg_addr,
            listen_pg_port,
            listen_http_addr,
            listen_http_port,
        } => {
            storcon_client
                .dispatch::<_, ()>(
                    Method::POST,
                    "control/v1/node".to_string(),
                    Some(NodeRegisterRequest {
                        node_id,
                        listen_pg_addr,
                        listen_pg_port,
                        listen_http_addr,
                        listen_http_port,
                    }),
                )
                .await?;
        }
        Command::TenantCreate { tenant_id } => {
            vps_client
                .tenant_create(&TenantCreateRequest {
                    new_tenant_id: TenantShardId::unsharded(tenant_id),
                    generation: None,
                    shard_parameters: ShardParameters::default(),
                    placement_policy: Some(PlacementPolicy::Attached(1)),
                    config: TenantConfig::default(),
                })
                .await?;
        }
        Command::TenantDelete { tenant_id } => {
            let status = vps_client
                .tenant_delete(TenantShardId::unsharded(tenant_id))
                .await?;
            tracing::info!("Delete status: {}", status);
        }
        Command::Nodes {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
            for node in resp {
                table.add_row([
                    format!("{}", node.id),
                    node.listen_http_addr,
                    format!("{:?}", node.scheduling),
                    format!("{:?}", node.availability),
                ]);
            }
            println!("{table}");
        }
        Command::NodeConfigure {
            node_id,
            availability,
            scheduling,
        } => {
            let req = NodeConfigureRequest {
                node_id,
                availability: availability.map(|a| a.0),
                scheduling,
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/node/{node_id}/config"),
                    Some(req),
                )
                .await?;
        }
        Command::Tenants {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
                    "control/v1/tenant".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
                "ShardCount",
                "StripeSize",
                "Placement",
                "Scheduling",
            ]);
            for tenant in resp {
                let shard_zero = tenant.shards.into_iter().next().unwrap();
                table.add_row([
                    format!("{}", tenant.tenant_id),
                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
                    format!("{:?}", tenant.stripe_size),
                    format!("{:?}", tenant.policy),
                    format!("{:?}", shard_zero.scheduling_policy),
                ]);
            }

            println!("{table}");
        }
        Command::TenantPolicy {
            tenant_id,
            placement,
            scheduling,
        } => {
            let req = TenantPolicyRequest {
                scheduling: scheduling.map(|s| s.0),
                placement: placement.map(|p| p.0),
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/policy"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantShardSplit {
            tenant_id,
            shard_count,
            stripe_size,
        } => {
            let req = TenantShardSplitRequest {
                new_shard_count: shard_count,
                new_stripe_size: stripe_size.map(ShardStripeSize),
            };

            let response = storcon_client
                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/shard_split"),
                    Some(req),
                )
                .await?;
            println!(
                "Split tenant {} into {} shards: {}",
                tenant_id,
                shard_count,
                response
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );
        }
        Command::TenantShardMigrate {
            tenant_shard_id,
            node,
        } => {
            let req = TenantShardMigrateRequest {
                tenant_shard_id,
                node_id: node,
            };

            storcon_client
                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;

            vps_client
                .tenant_config(&TenantConfigRequest {
                    tenant_id,
                    config: tenant_conf,
                })
                .await?;
        }
        Command::TenantScatter { tenant_id } => {
            // Find the shards
            let locate_response = storcon_client
                .dispatch::<(), TenantLocateResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}/locate"),
                    None,
                )
                .await?;
            let shards = locate_response.shards;

            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
            let shard_count = shards.len();
            for s in shards {
                let entry = node_to_shards.entry(s.node_id).or_default();
                entry.push(s.shard_id);
            }

            // Load list of available nodes
            let nodes_resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;

            for node in nodes_resp {
                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
                    node_to_shards.entry(node.id).or_default();
                }
            }

            let max_shard_per_node = shard_count / node_to_shards.len();

            loop {
                let mut migrate_shard = None;
                for shards in node_to_shards.values_mut() {
                    if shards.len() > max_shard_per_node {
                        // Pick the emptiest
                        migrate_shard = Some(shards.pop().unwrap());
                    }
                }
                let Some(migrate_shard) = migrate_shard else {
                    break;
                };

                // Pick the emptiest node to migrate to
                let mut destinations = node_to_shards
                    .iter()
                    .map(|(k, v)| (k, v.len()))
                    .collect::<Vec<_>>();
                destinations.sort_by_key(|i| i.1);
                let (destination_node, destination_count) = *destinations.first().unwrap();
                if destination_count + 1 > max_shard_per_node {
                    // Even the emptiest destination doesn't have space: we're done
                    break;
                }
                let destination_node = *destination_node;

                node_to_shards
                    .get_mut(&destination_node)
                    .unwrap()
                    .push(migrate_shard);

                println!("Migrate {} -> {} ...", migrate_shard, destination_node);

                storcon_client
                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                        Method::PUT,
                        format!("control/v1/tenant/{migrate_shard}/migrate"),
                        Some(TenantShardMigrateRequest {
                            tenant_shard_id: migrate_shard,
                            node_id: destination_node,
                        }),
                    )
                    .await?;
                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
            }

            // Spread the shards across the nodes
        }
        Command::TenantDescribe { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;
            let shards = describe_response.shards;
            let mut table = comfy_table::Table::new();
            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
            for shard in shards {
                let secondary = shard
                    .node_secondary
                    .iter()
                    .map(|n| format!("{}", n))
                    .collect::<Vec<_>>()
                    .join(",");

                let mut status_parts = Vec::new();
                if shard.is_reconciling {
                    status_parts.push("reconciling");
                }

                if shard.is_pending_compute_notification {
                    status_parts.push("pending_compute");
                }

                if shard.is_splitting {
                    status_parts.push("splitting");
                }
                let status = status_parts.join(",");

                table.add_row([
                    format!("{}", shard.tenant_shard_id),
                    shard
                        .node_attached
                        .map(|n| format!("{}", n))
                        .unwrap_or(String::new()),
                    secondary,
                    shard.last_error,
                    status,
                ]);
            }
            println!("{table}");
        }
        Command::TenantWarmup { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await;
            match describe_response {
                Ok(describe) => {
                    if matches!(describe.policy, PlacementPolicy::Secondary) {
                        // Fine: it's already known to controller in secondary mode: calling
                        // again to put it into secondary mode won't cause problems.
                    } else {
                        anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
                    }
                }
                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
                    // Fine: this tenant isn't known to the storage controller yet.
                }
                Err(e) => {
                    // Unexpected API error
                    return Err(e.into());
                }
            }

            vps_client
                .location_config(
                    TenantShardId::unsharded(tenant_id),
                    pageserver_api::models::LocationConfig {
                        mode: pageserver_api::models::LocationConfigMode::Secondary,
                        generation: None,
                        secondary_conf: Some(LocationConfigSecondary { warm: true }),
                        shard_number: 0,
                        shard_count: 0,
                        shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
                        tenant_conf: TenantConfig::default(),
                    },
                    None,
                    true,
                )
                .await?;

            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;

            let secondary_ps_id = describe_response
                .shards
                .first()
                .unwrap()
                .node_secondary
                .first()
                .unwrap();

            println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
            loop {
                let (status, progress) = vps_client
                    .tenant_secondary_download(
                        TenantShardId::unsharded(tenant_id),
                        Some(Duration::from_secs(10)),
                    )
                    .await?;
                println!(
                    "Progress: {}/{} layers, {}/{} bytes",
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
                match status {
                    StatusCode::OK => {
                        println!("Download complete");
                        break;
                    }
                    StatusCode::ACCEPTED => {
                        // Loop
                    }
                    _ => {
                        anyhow::bail!("Unexpected download status: {status}");
                    }
                }
            }
        }
    }

    Ok(())
}
@@ -2,8 +2,8 @@
 # see https://diesel.rs/guides/configuring-diesel-cli
 
 [print_schema]
-file = "control_plane/attachment_service/src/schema.rs"
+file = "storage_controller/src/schema.rs"
 custom_type_derives = ["diesel::query_builder::QueryId"]
 
 [migrations_directory]
-dir = "control_plane/attachment_service/migrations"
+dir = "storage_controller/migrations"
@@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphabetical order.
 Neon storage broker, providing messaging between safekeepers and pageservers.
 [storage_broker.md](./storage_broker.md)
 
+`storage_controller`:
+
+Neon storage controller, manages a cluster of pageservers and exposes an API that enables
+managing a many-sharded tenant as a single entity.
+
 `/control_plane`:
 
 Local control plane.
docs/storage_controller.md (new file, 150 lines)

# Storage Controller

## Concepts

The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).

It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
the underlying details of how data is spread across multiple nodes.

The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an "intent" state and a "reconcile" task that tries to make the outside world match the intent.

## APIs

The storage controller's HTTP server implements four logically separate APIs:

- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that's where clients expect to find it on a pageserver.
- `/control/v1/...` path is the storage controller's own API, which enables operations such as registering and managing pageservers, or executing shard splits.
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
  to ensure data safety with generation numbers.

The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers' APIs).

See the `http.rs` file in the source for where the HTTP APIs are implemented.

## Database

The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
rebuilt on startup.

The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.

The `diesel` crate is used for defining models & migrations.

Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller's database.

### Diesel tip: migrations

If you need to modify the database schema, here's how to create a migration:

- Install the diesel CLI with `cargo install diesel_cli`
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- Commit the migration files and the changes to `schema.rs`
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you've committed a migration no further steps are needed.

## storcon_cli

The `storcon_cli` tool enables interactive management of the storage controller. This is usually
only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline).

`storcon_cli --help` includes details on commands.

# Deploying

This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
part of a self-hosted system.

_General note: since the default `neon_local` environment includes a storage controller, this is a useful
reference when figuring out deployment._

## Database

It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.

The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.

Set the URL to the database using the `--database-url` CLI option.

There is no need to run migrations manually: the storage controller automatically applies migrations
when it starts up.

## Configure pageservers to use the storage controller

1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
   point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
   with the storage controller when it starts up. See the example below for the format of this file.

### Example `metadata.json`

```
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
```

- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
  postgres runs.
- `http_port` and `http_host` refer to the pageserver's HTTP API; this must be accessible from where
  the storage controller runs.

## Handle compute notifications

The storage controller independently moves tenant attachments between pageservers in response to
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.

The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.

In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.

When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.

```
struct ComputeHookNotifyRequestShard {
    node_id: NodeId,
    shard_number: ShardNumber,
}

struct ComputeHookNotifyRequest {
    tenant_id: TenantId,
    stripe_size: Option<ShardStripeSize>,
    shards: Vec<ComputeHookNotifyRequestShard>,
}
```

When a notification is received:

1. Modify postgres configuration for this tenant:
   - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
     shards identified by `NodeId` must be converted to the address+port of the node.
   - if `stripe_size` is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
   will retry the notification until it succeeds.

### Example notification body

```
{
  "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
  "stripe_size": 32768,
  "shards": [
    {"node_id": 344, "shard_number": 0},
    {"node_id": 722, "shard_number": 1}
  ]
}
```
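For on-premise implementers, a minimal sketch of the receiving side of this hook may help. This is only an illustration under stated assumptions: the serde derives, the plain numeric field types, and the `node_connstr` lookup are hypothetical, not part of the storage controller itself.

```
use serde::Deserialize;

// Hypothetical mirror of the notification body; field types simplified
// (NodeId as u64, shard_number as u8) for the sketch.
#[derive(Deserialize)]
struct ComputeHookNotifyRequestShard {
    node_id: u64,
    shard_number: u8,
}

#[derive(Deserialize)]
struct ComputeHookNotifyRequest {
    tenant_id: String,
    stripe_size: Option<u32>,
    shards: Vec<ComputeHookNotifyRequestShard>,
}

/// Build the value for `neon.pageserver_connstr`: one libpq connection string
/// per shard, ordered by shard number, comma separated. `node_connstr` is a
/// hypothetical lookup from pageserver NodeId to its postgres address+port.
fn pageserver_connstr(
    req: &ComputeHookNotifyRequest,
    node_connstr: impl Fn(u64) -> String,
) -> String {
    let mut shards: Vec<_> = req.shards.iter().collect();
    shards.sort_by_key(|s| s.shard_number);
    shards
        .into_iter()
        .map(|s| node_connstr(s.node_id))
        .collect::<Vec<_>>()
        .join(",")
}
```

After rewriting the configuration (and `neon.stripe_size`, when present), the handler would signal postgres with SIGHUP and only then return 200, so that the controller's retry loop keeps working if anything fails.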
@@ -10,11 +10,13 @@ libc.workspace = true
 once_cell.workspace = true
 chrono.workspace = true
 twox-hash.workspace = true
+measured.workspace = true
 
 workspace_hack.workspace = true
 
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
+measured-process.workspace = true
 
 [dev-dependencies]
 rand = "0.8"
|||||||
@@ -7,14 +7,19 @@
|
|||||||
//! use significantly less memory than this, but can only approximate the cardinality.
|
//! use significantly less memory than this, but can only approximate the cardinality.
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
hash::{BuildHasher, BuildHasherDefault, Hash},
|
||||||
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
|
sync::atomic::AtomicU8,
|
||||||
sync::{atomic::AtomicU8, Arc, RwLock},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
use prometheus::{
|
use measured::{
|
||||||
core::{self, Describer},
|
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
||||||
proto, Opts,
|
metric::{
|
||||||
|
group::{Encoding, MetricValue},
|
||||||
|
name::MetricNameEncoder,
|
||||||
|
Metric, MetricType, MetricVec,
|
||||||
|
},
|
||||||
|
text::TextEncoder,
|
||||||
|
LabelGroup,
|
||||||
};
|
};
|
||||||
use twox_hash::xxh3;
|
use twox_hash::xxh3;
|
||||||
|
|
||||||
@@ -93,203 +98,25 @@ macro_rules! register_hll {
|
|||||||
/// ```
|
/// ```
|
||||||
///
|
///
|
||||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||||
#[derive(Clone)]
|
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
|
||||||
pub struct HyperLogLogVec<const N: usize> {
|
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
|
||||||
core: Arc<HyperLogLogVecCore<N>>,
|
|
||||||
|
pub struct HyperLogLogState<const N: usize> {
|
||||||
|
shards: [AtomicU8; N],
|
||||||
}
|
}
|
||||||
|
impl<const N: usize> Default for HyperLogLogState<N> {
|
||||||
struct HyperLogLogVecCore<const N: usize> {
|
fn default() -> Self {
|
||||||
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
|
#[allow(clippy::declare_interior_mutable_const)]
|
||||||
pub desc: core::Desc,
|
const ZERO: AtomicU8 = AtomicU8::new(0);
|
||||||
pub opts: Opts,
|
Self { shards: [ZERO; N] }
|
||||||
}
|
|
||||||
|
|
||||||
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
|
|
||||||
fn desc(&self) -> Vec<&core::Desc> {
|
|
||||||
vec![&self.core.desc]
|
|
||||||
}
|
|
||||||
|
|
||||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
|
||||||
let mut m = proto::MetricFamily::default();
|
|
||||||
m.set_name(self.core.desc.fq_name.clone());
|
|
||||||
m.set_help(self.core.desc.help.clone());
|
|
||||||
m.set_field_type(proto::MetricType::GAUGE);
|
|
||||||
|
|
||||||
let mut metrics = Vec::new();
|
|
||||||
for child in self.core.children.read().unwrap().values() {
|
|
||||||
child.core.collect_into(&mut metrics);
|
|
||||||
}
|
|
||||||
m.set_metric(metrics);
|
|
||||||
|
|
||||||
vec![m]
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<const N: usize> HyperLogLogVec<N> {
|
impl<const N: usize> MetricType for HyperLogLogState<N> {
|
||||||
/// Create a new [`HyperLogLogVec`] based on the provided
|
type Metadata = ();
|
||||||
/// [`Opts`] and partitioned by the given label names. At least one label name must be
|
|
||||||
/// provided.
|
|
||||||
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
|
|
||||||
assert!(N.is_power_of_two());
|
|
||||||
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
|
|
||||||
let opts = opts.variable_labels(variable_names);
|
|
||||||
|
|
||||||
let desc = opts.describe()?;
|
|
||||||
let v = HyperLogLogVecCore {
|
|
||||||
children: RwLock::new(HashMap::default()),
|
|
||||||
desc,
|
|
||||||
opts,
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(Self { core: Arc::new(v) })
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
|
|
||||||
/// of label values (same order as the VariableLabels in Desc). If that combination of
|
|
||||||
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
|
|
||||||
///
|
|
||||||
/// An error is returned if the number of label values is not the same as the
|
|
||||||
/// number of VariableLabels in Desc.
|
|
||||||
pub fn get_metric_with_label_values(
|
|
||||||
&self,
|
|
||||||
vals: &[&str],
|
|
||||||
) -> prometheus::Result<HyperLogLog<N>> {
|
|
||||||
self.core.get_metric_with_label_values(vals)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
|
|
||||||
/// occurs.
|
|
||||||
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
|
|
||||||
self.get_metric_with_label_values(vals).unwrap()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<const N: usize> HyperLogLogVecCore<N> {
|
impl<const N: usize> HyperLogLogState<N> {
|
||||||
pub fn get_metric_with_label_values(
|
|
||||||
&self,
|
|
||||||
vals: &[&str],
|
|
||||||
) -> prometheus::Result<HyperLogLog<N>> {
|
|
||||||
let h = self.hash_label_values(vals)?;
|
|
||||||
|
|
||||||
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
|
|
||||||
return Ok(metric);
|
|
||||||
}
|
|
||||||
|
|
||||||
self.get_or_create_metric(h, vals)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
|
|
||||||
if vals.len() != self.desc.variable_labels.len() {
|
|
||||||
return Err(prometheus::Error::InconsistentCardinality {
|
|
||||||
expect: self.desc.variable_labels.len(),
|
|
||||||
got: vals.len(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut h = xxh3::Hash64::default();
|
|
||||||
for val in vals {
|
|
||||||
h.write(val.as_bytes());
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(h.finish())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_or_create_metric(
|
|
||||||
&self,
|
|
||||||
hash: u64,
|
|
||||||
label_values: &[&str],
|
|
||||||
) -> prometheus::Result<HyperLogLog<N>> {
|
|
||||||
let mut children = self.children.write().unwrap();
|
|
||||||
// Check exist first.
|
|
||||||
if let Some(metric) = children.get(&hash).cloned() {
|
|
||||||
return Ok(metric);
|
|
||||||
}
|
|
||||||
|
|
||||||
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
|
|
||||||
children.insert(hash, metric.clone());
|
|
||||||
Ok(metric)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// HLL is a probabilistic cardinality measure.
|
|
||||||
///
|
|
||||||
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
|
||||||
///
|
|
||||||
/// ```promql
|
|
||||||
/// # harmonic mean
|
|
||||||
/// 1 / (
|
|
||||||
/// sum (
|
|
||||||
/// 2 ^ -(
|
|
||||||
/// # HLL merge operation
|
|
||||||
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
|
||||||
/// )
|
|
||||||
/// ) without (hll_shard)
|
|
||||||
/// )
|
|
||||||
/// * alpha
|
|
||||||
/// * shards_count
|
|
||||||
/// * shards_count
|
|
||||||
/// ```
|
|
||||||
///
|
|
||||||
/// If you want an estimate over time, you can use the following query:
|
|
||||||
///
|
|
||||||
/// ```promql
|
|
||||||
/// # harmonic mean
|
|
||||||
/// 1 / (
|
|
||||||
/// sum (
|
|
||||||
/// 2 ^ -(
|
|
||||||
/// # HLL merge operation
|
|
||||||
/// max (
|
|
||||||
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
|
||||||
/// ) by (hll_shard, other_labels...)
|
|
||||||
/// )
|
|
||||||
/// ) without (hll_shard)
|
|
||||||
/// )
|
|
||||||
/// * alpha
|
|
||||||
/// * shards_count
|
|
||||||
/// * shards_count
|
|
||||||
/// ```
|
|
||||||
///
|
|
||||||
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
|
||||||
///
|
|
||||||
/// ```promql
|
|
||||||
/// # LinearCounting(m, V) = m log (m / V)
|
|
||||||
/// shards_count * ln(shards_count /
|
|
||||||
/// # calculate V = how many shards contain a 0
|
|
||||||
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
|
||||||
/// )
|
|
||||||
/// ```
|
|
||||||
///
|
|
||||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub struct HyperLogLog<const N: usize> {
|
|
||||||
core: Arc<HyperLogLogCore<N>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<const N: usize> HyperLogLog<N> {
|
|
||||||
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
|
|
||||||
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
|
|
||||||
assert!(N.is_power_of_two());
|
|
||||||
let opts = Opts::new(name, help);
|
|
||||||
Self::with_opts(opts)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a [`HyperLogLog`] with the `opts` options.
|
|
||||||
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
|
|
||||||
Self::with_opts_and_label_values(&opts, &[])
|
|
||||||
}
|
|
||||||
|
|
||||||
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
|
|
||||||
let desc = opts.describe()?;
|
|
||||||
let labels = make_label_pairs(&desc, label_values)?;
|
|
||||||
|
|
||||||
let v = HyperLogLogCore {
|
|
||||||
shards: [0; N].map(AtomicU8::new),
|
|
||||||
desc,
|
|
||||||
labels,
|
|
||||||
};
|
|
||||||
Ok(Self { core: Arc::new(v) })
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn measure(&self, item: &impl Hash) {
|
pub fn measure(&self, item: &impl Hash) {
|
||||||
// changing the hasher will break compatibility with previous measurements.
|
// changing the hasher will break compatibility with previous measurements.
|
||||||
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
||||||
@@ -299,42 +126,11 @@ impl<const N: usize> HyperLogLog<N> {
|
|||||||
let p = N.ilog2() as u8;
|
let p = N.ilog2() as u8;
|
||||||
let j = hash & (N as u64 - 1);
|
let j = hash & (N as u64 - 1);
|
||||||
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
||||||
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct HyperLogLogCore<const N: usize> {
|
|
||||||
shards: [AtomicU8; N],
|
|
||||||
desc: core::Desc,
|
|
||||||
labels: Vec<proto::LabelPair>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<const N: usize> core::Collector for HyperLogLog<N> {
|
|
||||||
fn desc(&self) -> Vec<&core::Desc> {
|
|
||||||
vec![&self.core.desc]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
fn take_sample(&self) -> [u8; N] {
|
||||||
let mut m = proto::MetricFamily::default();
|
self.shards.each_ref().map(|x| {
|
||||||
m.set_name(self.core.desc.fq_name.clone());
|
|
||||||
m.set_help(self.core.desc.help.clone());
|
|
||||||
m.set_field_type(proto::MetricType::GAUGE);
|
|
||||||
|
|
||||||
let mut metrics = Vec::new();
|
|
||||||
self.core.collect_into(&mut metrics);
|
|
||||||
m.set_metric(metrics);
|
|
||||||
|
|
||||||
vec![m]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<const N: usize> HyperLogLogCore<N> {
|
|
||||||
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
|
|
||||||
self.shards.iter().enumerate().for_each(|(i, x)| {
|
|
||||||
let mut shard_label = proto::LabelPair::default();
|
|
||||||
shard_label.set_name("hll_shard".to_owned());
|
|
||||||
shard_label.set_value(format!("{i}"));
|
|
||||||
|
|
||||||
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
||||||
|
|
||||||
// This seems like it would be a race condition,
|
// This seems like it would be a race condition,
|
||||||
@@ -344,85 +140,90 @@ impl<const N: usize> HyperLogLogCore<N> {
|
|||||||
|
|
||||||
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
||||||
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
||||||
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
|
x.swap(0, std::sync::atomic::Ordering::Relaxed)
|
||||||
|
|
||||||
let mut m = proto::Metric::default();
|
|
||||||
let mut c = proto::Gauge::default();
|
|
||||||
c.set_value(v as f64);
|
|
||||||
m.set_gauge(c);
|
|
||||||
|
|
||||||
let mut labels = Vec::with_capacity(self.labels.len() + 1);
|
|
||||||
labels.extend_from_slice(&self.labels);
|
|
||||||
labels.push(shard_label);
|
|
||||||
|
|
||||||
m.set_label(labels);
|
|
||||||
metrics.push(m);
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
||||||
fn make_label_pairs(
|
for HyperLogLogState<N>
|
||||||
desc: &core::Desc,
|
{
|
||||||
label_values: &[&str],
|
fn write_type(
|
||||||
) -> prometheus::Result<Vec<proto::LabelPair>> {
|
name: impl MetricNameEncoder,
|
||||||
if desc.variable_labels.len() != label_values.len() {
|
enc: &mut TextEncoder<W>,
|
||||||
return Err(prometheus::Error::InconsistentCardinality {
|
) -> Result<(), std::io::Error> {
|
||||||
expect: desc.variable_labels.len(),
|
enc.write_type(&name, measured::text::MetricType::Gauge)
|
||||||
got: label_values.len(),
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
fn collect_into(
|
||||||
|
&self,
|
||||||
|
_: &(),
|
||||||
|
labels: impl LabelGroup,
|
||||||
|
name: impl MetricNameEncoder,
|
||||||
|
enc: &mut TextEncoder<W>,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
struct I64(i64);
|
||||||
|
impl LabelValue for I64 {
|
||||||
|
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
|
||||||
|
v.write_int(self.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
|
struct HllShardLabel {
|
||||||
if total_len == 0 {
|
hll_shard: i64,
|
||||||
return Ok(vec![]);
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if desc.variable_labels.is_empty() {
|
impl LabelGroup for HllShardLabel {
|
||||||
return Ok(desc.const_label_pairs.clone());
|
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
|
||||||
}
|
const LE: &LabelName = LabelName::from_str("hll_shard");
|
||||||
|
v.write_value(LE, &I64(self.hll_shard));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let mut label_pairs = Vec::with_capacity(total_len);
|
self.take_sample()
|
||||||
for (i, n) in desc.variable_labels.iter().enumerate() {
|
.into_iter()
|
||||||
let mut label_pair = proto::LabelPair::default();
|
.enumerate()
|
||||||
-            label_pair.set_name(n.clone());
-            label_pair.set_value(label_values[i].to_owned());
-            label_pairs.push(label_pair);
+            .try_for_each(|(hll_shard, val)| {
+                enc.write_metric_value(
+                    name.by_ref(),
+                    labels.by_ref().compose_with(HllShardLabel {
+                        hll_shard: hll_shard as i64,
+                    }),
+                    MetricValue::Int(val as i64),
+                )
+            })
     }

-        for label_pair in &desc.const_label_pairs {
-            label_pairs.push(label_pair.clone());
-        }
-        label_pairs.sort();
-        Ok(label_pairs)
-    }
 }

 #[cfg(test)]
 mod tests {
     use std::collections::HashSet;

-    use prometheus::{proto, Opts};
+    use measured::{label::StaticLabelSet, FixedCardinalityLabel};
     use rand::{rngs::StdRng, Rng, SeedableRng};
     use rand_distr::{Distribution, Zipf};

     use crate::HyperLogLogVec;

-    fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
-        let mut metrics = vec![];
-        hll.core
-            .children
-            .read()
-            .unwrap()
-            .values()
-            .for_each(|c| c.core.collect_into(&mut metrics));
-        metrics
+    #[derive(FixedCardinalityLabel, Clone, Copy)]
+    #[label(singleton = "x")]
+    enum Label {
+        A,
+        B,
     }

-    fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
+    fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
+        // cannot go through the `hll.collect_family_into` interface yet...
+        // need to see if I can fix the conflicting impls problem in measured.
+        (
+            hll.get_metric(hll.with_labels(Label::A)).take_sample(),
+            hll.get_metric(hll.with_labels(Label::B)).take_sample(),
+        )
+    }
+
+    fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
         let mut buckets = [0.0; 32];
-        for metric in metrics.chunks_exact(32) {
-            if filter(&metric[0]) {
-                for (i, m) in metric.iter().enumerate() {
-                    buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
-                }
+        for &sample in samples {
+            for (i, m) in sample.into_iter().enumerate() {
+                buckets[i] = f64::max(buckets[i], m as f64);
             }
         }

@@ -437,7 +238,7 @@ mod tests {
     }

     fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
-        let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
+        let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();

         let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
         let mut set_a = HashSet::new();
@@ -445,18 +246,20 @@ mod tests {

         for x in iter.by_ref().take(n) {
             set_a.insert(x.to_bits());
-            hll.with_label_values(&["a"]).measure(&x.to_bits());
+            hll.get_metric(hll.with_labels(Label::A))
+                .measure(&x.to_bits());
         }
         for x in iter.by_ref().take(n) {
             set_b.insert(x.to_bits());
-            hll.with_label_values(&["b"]).measure(&x.to_bits());
+            hll.get_metric(hll.with_labels(Label::B))
+                .measure(&x.to_bits());
         }
         let merge = &set_a | &set_b;

-        let metrics = collect(&hll);
-        let len = get_cardinality(&metrics, |_| true);
-        let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
-        let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
+        let (a, b) = collect(&hll);
+        let len = get_cardinality(&[a, b]);
+        let len_a = get_cardinality(&[a]);
+        let len_b = get_cardinality(&[b]);

         ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
     }
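
Note: the per-register max in `get_cardinality` above is the HyperLogLog union operation; the estimation step itself sits in a hunk elided here. For context, a minimal sketch of the standard raw HLL estimate over such merged registers, with assumed constants (m = 32 registers, alpha ~ 0.697) and without the small/large-range corrections a production implementation would add:

    fn hll_estimate(samples: &[[u8; 32]]) -> f64 {
        const M: f64 = 32.0; // number of registers (assumed to match the vec above)
        const ALPHA: f64 = 0.697; // standard HLL bias constant for m = 32
        // The union of HyperLogLog sketches is the register-wise max,
        // which is exactly what `get_cardinality` computes above.
        let mut registers = [0u8; 32];
        for sample in samples {
            for (r, &s) in registers.iter_mut().zip(sample) {
                *r = (*r).max(s);
            }
        }
        // Raw estimate: alpha * m^2 / sum_j 2^-M[j].
        let harmonic: f64 = registers.iter().map(|&r| 2f64.powi(-i32::from(r))).sum();
        ALPHA * M * M / harmonic
    }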
[next file; path not preserved]

@@ -4,6 +4,17 @@
 //! a default registry.
 #![deny(clippy::undocumented_unsafe_blocks)]

+use measured::{
+    label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
+    metric::{
+        counter::CounterState,
+        gauge::GaugeState,
+        group::{Encoding, MetricValue},
+        name::{MetricName, MetricNameEncoder},
+        MetricEncoding, MetricFamilyEncoding,
+    },
+    FixedCardinalityLabel, LabelGroup, MetricGroup,
+};
 use once_cell::sync::Lazy;
 use prometheus::core::{
     Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -11,6 +22,7 @@ use prometheus::core::{
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
+use prometheus::Registry;
 pub use prometheus::{core, default_registry, proto};
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -23,13 +35,12 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
 pub use prometheus::{register_int_gauge, IntGauge};
 pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
 pub use prometheus::{Encoder, TextEncoder};
-use prometheus::{Registry, Result};

 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
-pub use hll::{HyperLogLog, HyperLogLogVec};
+pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
 #[cfg(target_os = "linux")]
 pub mod more_process_metrics;

@@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
 /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
 /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
 /// while holding the lock.
-pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
+pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
     INTERNAL_REGISTRY.register(c)
 }

@@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
     0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
 ];

+pub struct BuildInfo {
+    pub revision: &'static str,
+    pub build_tag: &'static str,
+}
+
+// todo: allow label group without the set
+impl LabelGroup for BuildInfo {
+    fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
+        const REVISION: &LabelName = LabelName::from_str("revision");
+        v.write_value(REVISION, &self.revision);
+        const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
+        v.write_value(BUILD_TAG, &self.build_tag);
+    }
+}
+
+impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
+where
+    GaugeState: MetricEncoding<T>,
+{
+    fn collect_family_into(
+        &self,
+        name: impl measured::metric::name::MetricNameEncoder,
+        enc: &mut T,
+    ) -> Result<(), T::Err> {
+        enc.write_help(&name, "Build/version information")?;
+        GaugeState::write_type(&name, enc)?;
+        GaugeState {
+            count: std::sync::atomic::AtomicI64::new(1),
+        }
+        .collect_into(&(), self, name, enc)
+    }
+}
+
+#[derive(MetricGroup)]
+#[metric(new(build_info: BuildInfo))]
+pub struct NeonMetrics {
+    #[cfg(target_os = "linux")]
+    #[metric(namespace = "process")]
+    #[metric(init = measured_process::ProcessCollector::for_self())]
+    process: measured_process::ProcessCollector,

+    #[metric(namespace = "libmetrics")]
+    #[metric(init = LibMetrics::new(build_info))]
+    libmetrics: LibMetrics,
+}
+
+#[derive(MetricGroup)]
+#[metric(new(build_info: BuildInfo))]
+pub struct LibMetrics {
+    #[metric(init = build_info)]
+    build_info: BuildInfo,
+
+    #[metric(flatten)]
+    rusage: Rusage,
+
+    serve_count: CollectionCounter,
+}
+
+fn write_gauge<Enc: Encoding>(
+    x: i64,
+    labels: impl LabelGroup,
+    name: impl MetricNameEncoder,
+    enc: &mut Enc,
+) -> Result<(), Enc::Err> {
+    enc.write_metric_value(name, labels, MetricValue::Int(x))
+}
+
+#[derive(Default)]
+struct Rusage;
+
+#[derive(FixedCardinalityLabel, Clone, Copy)]
+#[label(singleton = "io_operation")]
+enum IoOp {
+    Read,
+    Write,
+}
+
+impl<T: Encoding> MetricGroup<T> for Rusage
+where
+    GaugeState: MetricEncoding<T>,
+{
+    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
+        const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
+        const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
+
+        let ru = get_rusage_stats();
+
+        enc.write_help(
+            DISK_IO,
+            "Bytes written and read from disk, grouped by the operation (read|write)",
+        )?;
+        GaugeState::write_type(DISK_IO, enc)?;
+        write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
+        write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
+
+        enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
+        GaugeState::write_type(MAXRSS, enc)?;
+        write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
+
+        Ok(())
+    }
+}
+
+#[derive(Default)]
+struct CollectionCounter(CounterState);
+
+impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
+where
+    CounterState: MetricEncoding<T>,
+{
+    fn collect_family_into(
+        &self,
+        name: impl measured::metric::name::MetricNameEncoder,
+        enc: &mut T,
+    ) -> Result<(), T::Err> {
+        self.0.inc();
+        enc.write_help(&name, "Number of metric requests made")?;
+        self.0.collect_into(&(), NoLabels, name, enc)
+    }
+}
+
 pub fn set_build_info_metric(revision: &str, build_tag: &str) {
     let metric = register_int_gauge_vec!(
         "libmetrics_build_info",
@@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
     .expect("Failed to register build info metric");
     metric.with_label_values(&[revision, build_tag]).set(1);
 }
+const BYTES_IN_BLOCK: i64 = 512;

 // Records I/O stats in a "cross-platform" way.
 // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -117,7 +250,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
 fn update_rusage_metrics() {
     let rusage_stats = get_rusage_stats();

-    const BYTES_IN_BLOCK: i64 = 512;
     DISK_IO_BYTES
         .with_label_values(&["read"])
         .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
@@ -151,6 +283,7 @@ macro_rules! register_int_counter_pair_vec {
         }
     }};
 }
+
 /// Create an [`IntCounterPair`] and registers to default registry.
 #[macro_export(local_inner_macros)]
 macro_rules! register_int_counter_pair {
@@ -188,7 +321,10 @@ impl<P: Atomic> GenericCounterPairVec<P> {
     ///
     /// An error is returned if the number of label values is not the same as the
     /// number of VariableLabels in Desc.
-    pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<GenericCounterPair<P>> {
         Ok(GenericCounterPair {
             inc: self.inc.get_metric_with_label_values(vals)?,
             dec: self.dec.get_metric_with_label_values(vals)?,
@@ -201,7 +337,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
         self.get_metric_with_label_values(vals).unwrap()
     }

-    pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
+    pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
         res[0] = self.inc.remove_label_values(vals);
         res[1] = self.dec.remove_label_values(vals);
     }
@@ -285,3 +421,171 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;

 /// A guard for [`IntCounterPair`] that will decrement the gauge on drop
 pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
+
+pub trait CounterPairAssoc {
+    const INC_NAME: &'static MetricName;
+    const DEC_NAME: &'static MetricName;
+
+    const INC_HELP: &'static str;
+    const DEC_HELP: &'static str;
+
+    type LabelGroupSet: LabelGroupSet;
+}
+
+pub struct CounterPairVec<A: CounterPairAssoc> {
+    vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
+}
+
+impl<A: CounterPairAssoc> Default for CounterPairVec<A>
+where
+    A::LabelGroupSet: Default,
+{
+    fn default() -> Self {
+        Self {
+            vec: Default::default(),
+        }
+    }
+}
+
+impl<A: CounterPairAssoc> CounterPairVec<A> {
+    pub fn guard(
+        &self,
+        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
+    ) -> MeasuredCounterPairGuard<'_, A> {
+        let id = self.vec.with_labels(labels);
+        self.vec.get_metric(id).inc.inc();
+        MeasuredCounterPairGuard { vec: &self.vec, id }
+    }
+    pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
+        let id = self.vec.with_labels(labels);
+        self.vec.get_metric(id).inc.inc();
+    }
+    pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
+        let id = self.vec.with_labels(labels);
+        self.vec.get_metric(id).dec.inc();
+    }
+    pub fn remove_metric(
+        &self,
+        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
+    ) -> Option<MeasuredCounterPairState> {
+        let id = self.vec.with_labels(labels);
+        self.vec.remove_metric(id)
+    }
+}
+
+impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
+where
+    T: ::measured::metric::group::Encoding,
+    A: CounterPairAssoc,
+    ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
+{
+    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
+        // write decrement first to avoid a race condition where inc - dec < 0
+        T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
+        self.vec
+            .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
+
+        T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
+        self.vec
+            .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
+
+        Ok(())
+    }
+}
+
+#[derive(MetricGroup, Default)]
+pub struct MeasuredCounterPairState {
+    pub inc: CounterState,
+    pub dec: CounterState,
+}
+
+impl measured::metric::MetricType for MeasuredCounterPairState {
+    type Metadata = ();
+}
+
+pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
+    vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
+    id: measured::metric::LabelId<A::LabelGroupSet>,
+}
+
+impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
+    fn drop(&mut self) {
+        self.vec.get_metric(self.id).dec.inc();
+    }
+}
+
+/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
+struct Inc<T>(T);
+/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
+struct Dec<T>(T);
+
+impl<T: Encoding> Encoding for Inc<T> {
+    type Err = T::Err;
+
+    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
+        self.0.write_help(name, help)
+    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
+}
+
+impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
+where
+    CounterState: MetricEncoding<T>,
+{
+    fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
+        CounterState::write_type(name, &mut enc.0)
+    }
+    fn collect_into(
+        &self,
+        metadata: &(),
+        labels: impl LabelGroup,
+        name: impl MetricNameEncoder,
+        enc: &mut Inc<T>,
+    ) -> Result<(), T::Err> {
+        self.inc.collect_into(metadata, labels, name, &mut enc.0)
+    }
+}
+
+impl<T: Encoding> Encoding for Dec<T> {
+    type Err = T::Err;
+
+    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
+        self.0.write_help(name, help)
+    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
+}
+
+/// Write the dec counter to the encoder
+impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
+where
+    CounterState: MetricEncoding<T>,
+{
+    fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
+        CounterState::write_type(name, &mut enc.0)
+    }
+    fn collect_into(
+        &self,
+        metadata: &(),
+        labels: impl LabelGroup,
+        name: impl MetricNameEncoder,
+        enc: &mut Dec<T>,
+    ) -> Result<(), T::Err> {
+        self.dec.collect_into(metadata, labels, name, &mut enc.0)
+    }
+}
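
Note: a usage sketch for the new `CounterPairVec` API above. The `Protocol` label and the metric names are illustrative assumptions, not part of this diff; `StaticLabelSet` and the derive are borrowed from measured as used in the hll test changes. The guard increments the inc counter on creation and the dec counter on drop, so `inc - dec` counts in-flight work, and `collect_group_into` writes dec before inc so a scrape never observes a negative difference:

    #[derive(FixedCardinalityLabel, Clone, Copy)]
    #[label(singleton = "protocol")]
    enum Protocol {
        Tcp,
        Ws,
    }

    struct OpenConnections;

    impl CounterPairAssoc for OpenConnections {
        const INC_NAME: &'static MetricName = MetricName::from_str("opened_connections_total");
        const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total");
        const INC_HELP: &'static str = "Number of connections opened";
        const DEC_HELP: &'static str = "Number of connections closed";
        type LabelGroupSet = StaticLabelSet<Protocol>;
    }

    fn serve(metrics: &CounterPairVec<OpenConnections>) {
        // inc is bumped here; dec is bumped automatically when `_guard` drops,
        // so the pair tracks the number of connections currently open.
        let _guard = metrics.guard(Protocol::Tcp);
        // ... handle the connection ...
    }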
[next file; path not preserved]

@@ -2,9 +2,9 @@ use std::str::FromStr;

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix). Implemented by the server
-/// in [`attachment_service::http`]
+/// in [`storage_controller::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::NodeId;
+use utils::id::{NodeId, TenantId};

 use crate::{
     models::{ShardParameters, TenantConfig},
@@ -42,6 +42,12 @@ pub struct NodeConfigureRequest {
     pub scheduling: Option<NodeSchedulingPolicy>,
 }

+#[derive(Serialize, Deserialize)]
+pub struct TenantPolicyRequest {
+    pub placement: Option<PlacementPolicy>,
+    pub scheduling: Option<ShardSchedulingPolicy>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
     pub shard_id: TenantShardId,
@@ -62,12 +68,27 @@ pub struct TenantLocateResponse {

 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
+    pub tenant_id: TenantId,
     pub shards: Vec<TenantDescribeResponseShard>,
     pub stripe_size: ShardStripeSize,
     pub policy: PlacementPolicy,
     pub config: TenantConfig,
 }

+#[derive(Serialize, Deserialize)]
+pub struct NodeDescribeResponse {
+    pub id: NodeId,
+
+    pub availability: NodeAvailabilityWrapper,
+    pub scheduling: NodeSchedulingPolicy,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
     pub tenant_shard_id: TenantShardId,
@@ -83,6 +104,8 @@ pub struct TenantDescribeResponseShard {
     pub is_pending_compute_notification: bool,
     /// A shard split is currently underway
     pub is_splitting: bool,
+
+    pub scheduling_policy: ShardSchedulingPolicy,
 }

 /// Explicitly migrating a particular shard is a low level operation
@@ -97,7 +120,7 @@ pub struct TenantShardMigrateRequest {
 /// Utilisation score indicating how good a candidate a pageserver
 /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
 /// Lower values are better.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
 pub struct UtilizationScore(pub u64);

 impl UtilizationScore {
@@ -106,7 +129,7 @@ impl UtilizationScore {
     }
 }

-#[derive(Serialize, Clone, Copy)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
     // Normal, happy state
@@ -129,7 +152,7 @@ impl Eq for NodeAvailability {}
 // This wrapper provides serde functionality and it should only be used to
 // communicate with external callers which don't know or care about the
 // utilisation score of the pageserver it is targeting.
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 pub enum NodeAvailabilityWrapper {
     Active,
     Offline,
@@ -155,22 +178,33 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
     }
 }

-impl FromStr for NodeAvailability {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            // This is used when parsing node configuration requests from neon-local.
-            // Assume the worst possible utilisation score
-            // and let it get updated via the heartbeats.
-            "active" => Ok(Self::Active(UtilizationScore::worst())),
-            "offline" => Ok(Self::Offline),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
+pub enum ShardSchedulingPolicy {
+    // Normal mode: the tenant's scheduled locations may be updated at will, including
+    // for non-essential optimization.
+    Active,
+
+    // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
+    // For example, this still permits a node's attachment location to change to a secondary in
+    // response to a node failure, or to assign a new secondary if a node was removed.
+    Essential,
+
+    // No scheduling: leave the shard running wherever it currently is. Even if the shard is
+    // unavailable, it will not be rescheduled to another node.
+    Pause,
+
+    // No reconciling: we will make no location_conf API calls to pageservers at all. If the
+    // shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
+    Stop,
+}
+
+impl Default for ShardSchedulingPolicy {
+    fn default() -> Self {
+        Self::Active
     }
 }

-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum NodeSchedulingPolicy {
     Active,
     Filling,
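
Note: for illustration, a request built from the new `TenantPolicyRequest` type. The exact wire encoding is an assumption based on the derives above (serde's default enum representation), not something this diff pins down:

    let req = TenantPolicyRequest {
        // Leave the placement policy unchanged; stop all scheduling and reconciling.
        placement: None,
        scheduling: Some(ShardSchedulingPolicy::Stop),
    };
    // With serde's default representation this serializes roughly as:
    //   {"placement":null,"scheduling":"Stop"}
    let body = serde_json::to_string(&req).unwrap();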
[next file; path not preserved]

@@ -1,8 +1,10 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
+use std::ops::RangeInclusive;
 use std::{fmt, ops::Range};

 use crate::reltag::{BlockNumber, RelTag, SlruKind};
@@ -21,9 +23,81 @@ pub struct Key {
     pub field6: u32,
 }

+/// The storage key size.
 pub const KEY_SIZE: usize = 18;

+/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
+/// See [`Key::to_i128`] for more information on the encoding.
+pub const METADATA_KEY_SIZE: usize = 16;
+
+/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 is a metadata key.
+pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;
+
+/// The (reserved) key prefix of relation sizes.
+pub const RELATION_SIZE_PREFIX: u8 = 0x81;
+
+/// The key prefix of AUX file keys.
+pub const AUX_KEY_PREFIX: u8 = 0x82;
+
+/// Check if the key falls in the range of metadata keys.
+pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
+    key[0] >= METADATA_KEY_BEGIN_PREFIX
+}
+
 impl Key {
+    /// Check if the key falls in the range of metadata keys.
+    pub const fn is_metadata_key(&self) -> bool {
+        self.field1 >= METADATA_KEY_BEGIN_PREFIX
+    }
+
+    /// Encode a metadata key to a storage key.
+    pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
+        assert!(is_metadata_key_slice(key), "key not in metadata key range");
+        Key {
+            field1: key[0],
+            field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
+            field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
+            field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
+            field5: key[11],
+            field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
+        }
+    }
+
+    /// Encode a metadata key to a storage key.
+    pub fn from_metadata_key(key: &[u8]) -> Self {
+        Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
+    }
+
+    /// Extract a metadata key to a writer. The result should always be 16 bytes.
+    pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
+        writer.put_u8(self.field1);
+        assert!(self.field2 <= 0xFFFF);
+        writer.put_u16(self.field2 as u16);
+        writer.put_u32(self.field3);
+        writer.put_u32(self.field4);
+        writer.put_u8(self.field5);
+        writer.put_u32(self.field6);
+    }
+
+    /// Get the range of metadata keys.
+    pub fn metadata_key_range() -> RangeInclusive<Self> {
+        Key {
+            field1: METADATA_KEY_BEGIN_PREFIX,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }..=Key {
+            field1: u8::MAX,
+            field2: u16::MAX as u32,
+            field3: u32::MAX,
+            field4: u32::MAX,
+            field5: u8::MAX,
+            field6: u32::MAX,
+        }
+    }
+
     /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
     /// As long as Neon does not support tablespace (because of lack of access to local file system),
     /// we can assume that only some predefined namespace OIDs are used which can fit in u16
@@ -48,11 +122,11 @@ impl Key {
         }
     }

-    pub fn next(&self) -> Key {
+    pub const fn next(&self) -> Key {
         self.add(1)
     }

-    pub fn add(&self, x: u32) -> Key {
+    pub const fn add(&self, x: u32) -> Key {
         let mut key = *self;

         let r = key.field6.overflowing_add(x);
@@ -81,6 +155,8 @@ impl Key {
         key
     }

+    /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
+    /// Use [`Key::from_metadata_key`] instead.
     pub fn from_slice(b: &[u8]) -> Self {
         Key {
             field1: b[0],
@@ -92,6 +168,8 @@ impl Key {
         }
     }

+    /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
+    /// Use [`Key::extract_metadata_key_to_writer`] instead.
     pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
         buf[0] = self.field1;
         BE::write_u32(&mut buf[1..5], self.field2);
@@ -475,12 +553,14 @@ pub const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

+pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
+
 // AUX_FILES currently stores only data for logical replication (slots etc), and
 // we don't preserve these on a branch because safekeepers can't follow timeline
 // switch (and generally it likely should be optional), so ignore these.
 #[inline(always)]
 pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
+    !NON_INHERITED_RANGE.contains(&key)
 }

 #[inline(always)]
@@ -556,11 +636,14 @@ impl std::str::FromStr for Key {
 mod tests {
     use std::str::FromStr;

+    use crate::key::is_metadata_key_slice;
     use crate::key::Key;

     use rand::Rng;
     use rand::SeedableRng;

+    use super::AUX_KEY_PREFIX;
+
     #[test]
     fn display_fromstr_bijection() {
         let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -576,4 +659,16 @@ mod tests {

         assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
     }
+
+    #[test]
+    fn test_metadata_keys() {
+        let mut metadata_key = vec![AUX_KEY_PREFIX];
+        metadata_key.extend_from_slice(&[0xFF; 15]);
+        let encoded_key = Key::from_metadata_key(&metadata_key);
+        let mut output_key = Vec::new();
+        encoded_key.extract_metadata_key_to_writer(&mut output_key);
+        assert_eq!(metadata_key, output_key);
+        assert!(encoded_key.is_metadata_key());
+        assert!(is_metadata_key_slice(&metadata_key));
+    }
 }
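
Note: the new metadata-key encoding above packs a `Key` into 16 bytes as u8 | u16 | u32 | u32 | u8 | u32, big-endian, truncating field2 to 16 bits. A sketch of the roundtrip using only APIs from this diff (field values illustrative):

    let key = Key {
        field1: AUX_KEY_PREFIX, // 0x82, inside the >= 0x80 metadata range
        field2: 0x1234,         // must fit in u16 for a metadata key
        field3: 0xAABBCCDD,
        field4: 0x00112233,
        field5: 0x7F,
        field6: 0xDEADBEEF,
    };
    let mut buf = Vec::new();
    key.extract_metadata_key_to_writer(&mut buf); // 1 + 2 + 4 + 4 + 1 + 4 bytes
    assert_eq!(buf.len(), METADATA_KEY_SIZE);     // = 16
    assert_eq!(Key::from_metadata_key(&buf), key); // lossless roundtrip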
[next file; path not preserved]

@@ -94,12 +94,13 @@ impl KeySpace {

     /// Remove all keys in `other` from `self`.
     /// This can involve splitting or removing of existing ranges.
-    pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
+    /// Returns the removed keyspace
+    pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
         let (self_start, self_end) = match (self.start(), self.end()) {
             (Some(start), Some(end)) => (start, end),
             _ => {
                 // self is empty
-                return;
+                return KeySpace::default();
             }
         };

@@ -112,30 +113,37 @@ impl KeySpace {
             .skip_while(|range| self_start >= range.end)
             .take_while(|range| self_end > range.start);

+        let mut removed_accum = KeySpaceRandomAccum::new();
         for range in other_ranges {
             while let Some(overlap_at) = self.overlaps_at(range) {
                 let overlapped = self.ranges[overlap_at].clone();

                 if overlapped.start < range.start && overlapped.end <= range.end {
                     // Higher part of the range is completely overlapped.
+                    removed_accum.add_range(range.start..self.ranges[overlap_at].end);
                     self.ranges[overlap_at].end = range.start;
                 }
                 if overlapped.start >= range.start && overlapped.end > range.end {
                     // Lower part of the range is completely overlapped.
+                    removed_accum.add_range(self.ranges[overlap_at].start..range.end);
                     self.ranges[overlap_at].start = range.end;
                 }
                 if overlapped.start < range.start && overlapped.end > range.end {
                     // Middle part of the range is overlapped.
+                    removed_accum.add_range(range.clone());
                     self.ranges[overlap_at].end = range.start;
                     self.ranges
                         .insert(overlap_at + 1, range.end..overlapped.end);
                 }
                 if overlapped.start >= range.start && overlapped.end <= range.end {
                     // Whole range is overlapped
+                    removed_accum.add_range(self.ranges[overlap_at].clone());
                     self.ranges.remove(overlap_at);
                 }
             }
         }
+
+        removed_accum.to_keyspace()
     }

     pub fn start(&self) -> Option<Key> {
@@ -553,7 +561,16 @@ mod tests {
             Key::from_i128(11)..Key::from_i128(13),
         ],
     };
-    key_space1.remove_overlapping_with(&key_space2);
+    let removed = key_space1.remove_overlapping_with(&key_space2);
+    let removed_expected = KeySpace {
+        ranges: vec![
+            Key::from_i128(2)..Key::from_i128(3),
+            Key::from_i128(6)..Key::from_i128(7),
+            Key::from_i128(11)..Key::from_i128(12),
+        ],
+    };
+    assert_eq!(removed, removed_expected);
+
     assert_eq!(
         key_space1.ranges,
         vec![
@@ -583,7 +600,17 @@ mod tests {
             Key::from_i128(14)..Key::from_i128(17),
         ],
     };
-    key_space1.remove_overlapping_with(&key_space2);
+
+    let removed = key_space1.remove_overlapping_with(&key_space2);
+    let removed_expected = KeySpace {
+        ranges: vec![
+            Key::from_i128(3)..Key::from_i128(5),
+            Key::from_i128(8)..Key::from_i128(10),
+            Key::from_i128(14)..Key::from_i128(15),
+        ],
+    };
+    assert_eq!(removed, removed_expected);
+
     assert_eq!(
         key_space1.ranges,
         vec![
@@ -610,7 +637,11 @@ mod tests {
             Key::from_i128(15)..Key::from_i128(17),
         ],
     };
-    key_space1.remove_overlapping_with(&key_space2);
+
+    let removed = key_space1.remove_overlapping_with(&key_space2);
+    let removed_expected = KeySpace::default();
+    assert_eq!(removed, removed_expected);
+
     assert_eq!(
         key_space1.ranges,
         vec![
@@ -637,7 +668,17 @@ mod tests {
     let key_space2 = KeySpace {
         ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
     };
-    key_space1.remove_overlapping_with(&key_space2);
+
+    let removed = key_space1.remove_overlapping_with(&key_space2);
+    let removed_expected = KeySpace {
+        ranges: vec![
+            Key::from_i128(9)..Key::from_i128(10),
+            Key::from_i128(12)..Key::from_i128(15),
+            Key::from_i128(17)..Key::from_i128(19),
+        ],
+    };
+    assert_eq!(removed, removed_expected);
+
     assert_eq!(
         key_space1.ranges,
         vec![
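
Note: a sketch of the changed `remove_overlapping_with` contract; the overlap is removed from `self` and also returned, so the returned keyspace and the remainder partition the original:

    let mut ks = KeySpace {
        ranges: vec![Key::from_i128(0)..Key::from_i128(10)],
    };
    let other = KeySpace {
        ranges: vec![Key::from_i128(4)..Key::from_i128(6)],
    };
    let removed = ks.remove_overlapping_with(&other);
    // The middle of the range is split out and handed back:
    assert_eq!(removed.ranges, vec![Key::from_i128(4)..Key::from_i128(6)]);
    assert_eq!(
        ks.ranges,
        vec![
            Key::from_i128(0)..Key::from_i128(4),
            Key::from_i128(6)..Key::from_i128(10),
        ]
    );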
[next file; path not preserved]

@@ -20,6 +20,7 @@ use utils::{
     history_buffer::HistoryBufferWithDropCounter,
     id::{NodeId, TenantId, TimelineId},
     lsn::Lsn,
+    serde_system_time,
 };

 use crate::controller_api::PlacementPolicy;
@@ -301,6 +302,7 @@ pub struct TenantConfig {
     pub heatmap_period: Option<String>,
     pub lazy_slru_download: Option<bool>,
     pub timeline_get_throttle: Option<ThrottleConfig>,
+    pub image_layer_creation_check_threshold: Option<u8>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -427,6 +429,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
+    #[serde(skip_serializing_if = "Option::is_none")]
     pub tenant_id: Option<TenantShardId>,
     #[serde(flatten)]
     pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -745,10 +748,18 @@ pub struct TimelineGcRequest {
     pub gc_horizon: Option<u64>,
 }

+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WalRedoManagerProcessStatus {
+    pub pid: u32,
+    /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
+    /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
+    pub kind: Cow<'static, str>,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalRedoManagerStatus {
     pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
-    pub pid: Option<u32>,
+    pub process: Option<WalRedoManagerProcessStatus>,
 }

 /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
@@ -757,11 +768,7 @@ pub struct WalRedoManagerStatus {
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
 pub struct SecondaryProgress {
     /// The remote storage LastModified time of the heatmap object we last downloaded.
-    #[serde(
-        serialize_with = "opt_ser_rfc3339_millis",
-        deserialize_with = "opt_deser_rfc3339_millis"
-    )]
-    pub heatmap_mtime: Option<SystemTime>,
+    pub heatmap_mtime: Option<serde_system_time::SystemTime>,

     /// The number of layers currently on-disk
     pub layers_downloaded: usize,
@@ -774,29 +781,6 @@ pub struct SecondaryProgress {
     pub bytes_total: u64,
 }

-fn opt_ser_rfc3339_millis<S: serde::Serializer>(
-    ts: &Option<SystemTime>,
-    serializer: S,
-) -> Result<S::Ok, S::Error> {
-    match ts {
-        Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
-        None => serializer.serialize_none(),
-    }
-}
-
-fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-{
-    let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
-    match s {
-        None => Ok(None),
-        Some(s) => humantime::parse_rfc3339(&s)
-            .map_err(serde::de::Error::custom)
-            .map(Some),
-    }
-}
-
 pub mod virtual_file {
     #[derive(
         Copy,
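
Note: a sketch of the reshaped wal-redo status payload; the pid and kind values are illustrative, and the rendered JSON is an assumption from the serde derives above:

    let status = WalRedoManagerStatus {
        last_redo_at: None,
        process: Some(WalRedoManagerProcessStatus {
            pid: 42,
            kind: std::borrow::Cow::Borrowed("example-kind"), // illustrative value
        }),
    };
    // serde_json would render this roughly as:
    //   {"last_redo_at":null,"process":{"pid":42,"kind":"example-kind"}}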
[next file; path not preserved]

@@ -1,4 +1,4 @@
-use std::time::SystemTime;
+use utils::serde_system_time::SystemTime;

 /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
 /// the next tenant.
@@ -21,28 +21,9 @@ pub struct PageserverUtilization {
     /// When was this snapshot captured, pageserver local time.
     ///
     /// Use millis to give confidence that the value is regenerated often enough.
-    #[serde(
-        serialize_with = "ser_rfc3339_millis",
-        deserialize_with = "deser_rfc3339_millis"
-    )]
     pub captured_at: SystemTime,
 }

-fn ser_rfc3339_millis<S: serde::Serializer>(
-    ts: &SystemTime,
-    serializer: S,
-) -> Result<S::Ok, S::Error> {
-    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
-}
-
-fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-{
-    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
-    humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
-}
-
 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
 ///
 /// Instead of newtype, use this because a newtype would get require handling deserializing values
@@ -69,7 +50,9 @@ mod tests {
             disk_usage_bytes: u64::MAX,
             free_space_bytes: 0,
             utilization_score: u64::MAX,
-            captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
+            captured_at: SystemTime(
+                std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
+            ),
         };

         let s = serde_json::to_string(&doc).unwrap();
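
Note: both files above replace their hand-rolled `*_rfc3339_millis` serializers with `utils::serde_system_time::SystemTime`, which is defined outside this diff. Judging from the deleted helpers and the test's `SystemTime(...)` constructor, it is presumably a newtype along these lines (a sketch, not the actual utils implementation):

    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub struct SystemTime(pub std::time::SystemTime);

    impl serde::Serialize for SystemTime {
        fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
            // Same formatting the deleted helpers used: RFC 3339 with millis.
            serializer.collect_str(&humantime::format_rfc3339_millis(self.0))
        }
    }

    impl<'de> serde::Deserialize<'de> for SystemTime {
        fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
            let s: String = serde::Deserialize::deserialize(deserializer)?;
            humantime::parse_rfc3339(&s)
                .map(SystemTime)
                .map_err(serde::de::Error::custom)
        }
    }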
|
|||||||
@@ -5,15 +5,93 @@ use crate::{
|
|||||||
models::ShardParameters,
|
models::ShardParameters,
|
||||||
};
|
};
|
||||||
use hex::FromHex;
|
use hex::FromHex;
|
||||||
|
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::TenantId;
|
use utils::id::TenantId;
|
||||||
|
|
||||||
|
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
|
||||||
|
///
|
||||||
|
/// This module contains a variety of types used to represent the concept of sharding
|
||||||
|
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
|
||||||
|
/// we provide an summary here.
|
||||||
|
///
|
||||||
|
/// Types used to describe shards:
|
||||||
|
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
|
||||||
|
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
|
||||||
|
/// a shard suffix.
|
||||||
|
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
|
||||||
|
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
|
||||||
|
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
|
||||||
|
/// tenant, such as layer files.
|
||||||
|
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
|
||||||
|
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
|
||||||
|
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
|
||||||
|
/// four hex digits. An unsharded tenant is `0000`.
|
||||||
|
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
|
||||||
|
///
|
||||||
|
/// Types used to describe the parameters for data distribution in a sharded tenant:
|
||||||
|
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
|
||||||
|
/// multiple shards. Its value is given in 8kiB pages.
|
||||||
|
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
|
||||||
|
/// always zero: this is provided for future upgrades that might introduce different
|
||||||
|
/// data distribution schemes.
|
||||||
|
///
|
||||||
|
/// Examples:
|
||||||
|
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
|
||||||
|
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
|
||||||
|
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||||
|
/// and their slugs are 0004, 0104, 0204, and 0304.
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardNumber(pub u8);
|
pub struct ShardNumber(pub u8);
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardCount(u8);
|
pub struct ShardCount(u8);
|
||||||
|
|
||||||
|
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
||||||
|
/// when we need to know which shard we're dealing with, but do not need to know the full
|
||||||
|
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
||||||
|
/// the fully qualified TenantShardId.
|
||||||
|
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||||
|
pub struct ShardIndex {
|
||||||
|
pub shard_number: ShardNumber,
|
||||||
|
pub shard_count: ShardCount,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
|
||||||
|
/// and to check whether that [`ShardNumber`] is the same as the current shard.
|
||||||
|
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||||
|
pub struct ShardIdentity {
|
||||||
|
pub number: ShardNumber,
|
||||||
|
pub count: ShardCount,
|
||||||
|
pub stripe_size: ShardStripeSize,
|
||||||
|
layout: ShardLayout,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Formatting helper, for generating the `shard_id` label in traces.
|
||||||
|
struct ShardSlug<'a>(&'a TenantShardId);
|
||||||
|
|
||||||
|
/// TenantShardId globally identifies a particular shard in a particular tenant.
|
||||||
|
///
|
||||||
|
/// These are written as `<TenantId>-<ShardSlug>`, for example:
|
||||||
|
/// # The second shard in a two-shard tenant
|
||||||
|
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||||
|
///
|
||||||
|
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
|
||||||
|
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
|
||||||
|
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
|
||||||
|
///
|
||||||
|
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
|
||||||
|
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
|
||||||
|
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||||
|
/// as a TenantId.
|
||||||
|
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||||
|
pub struct TenantShardId {
|
||||||
|
pub tenant_id: TenantId,
|
||||||
|
pub shard_number: ShardNumber,
|
||||||
|
pub shard_count: ShardCount,
|
||||||
|
}
|
||||||
|
|
||||||
impl ShardCount {
|
impl ShardCount {
|
||||||
pub const MAX: Self = Self(u8::MAX);
|
pub const MAX: Self = Self(u8::MAX);
|
||||||
|
|
||||||
@@ -38,6 +116,7 @@ impl ShardCount {
|
|||||||
self.0
|
self.0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///
|
||||||
pub fn is_unsharded(&self) -> bool {
|
pub fn is_unsharded(&self) -> bool {
|
||||||
self.0 == 0
|
self.0 == 0
|
||||||
}
|
}
|
||||||
@@ -53,33 +132,6 @@ impl ShardNumber {
|
|||||||
pub const MAX: Self = Self(u8::MAX);
|
pub const MAX: Self = Self(u8::MAX);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// TenantShardId identify the units of work for the Pageserver.
|
|
||||||
///
|
|
||||||
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
|
|
||||||
///
|
|
||||||
/// # The second shard in a two-shard tenant
|
|
||||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
|
||||||
///
|
|
||||||
/// Historically, tenants could not have multiple shards, and were identified
|
|
||||||
/// by TenantId. To support this, TenantShardId has a special legacy
|
|
||||||
/// mode where `shard_count` is equal to zero: this represents a single-sharded
|
|
||||||
/// tenant which should be written as a TenantId with no suffix.
|
|
||||||
///
|
|
||||||
/// The human-readable encoding of TenantShardId, such as used in API URLs,
|
|
||||||
/// is both forward and backward compatible: a legacy TenantId can be
|
|
||||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
|
||||||
/// as a TenantId.
|
|
||||||
///
|
|
||||||
/// Note that the binary encoding is _not_ backward compatible, because
|
|
||||||
/// at the time sharding is introduced, there are no existing binary structures
|
|
||||||
/// containing TenantId that we need to handle.
|
|
||||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
|
||||||
pub struct TenantShardId {
|
|
||||||
pub tenant_id: TenantId,
|
|
||||||
pub shard_number: ShardNumber,
|
|
||||||
pub shard_count: ShardCount,
|
|
||||||
}
|
|
||||||
|
|
||||||
 impl TenantShardId {
     pub fn unsharded(tenant_id: TenantId) -> Self {
         Self {
@@ -111,10 +163,13 @@ impl TenantShardId {
     }
 
     /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_zero(&self) -> bool {
+    pub fn is_shard_zero(&self) -> bool {
         self.shard_number == ShardNumber(0)
     }
 
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
     pub fn is_unsharded(&self) -> bool {
         self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
     }
@@ -150,9 +205,6 @@ impl TenantShardId {
     }
 }
 
-/// Formatting helper
-struct ShardSlug<'a>(&'a TenantShardId);
-
 impl<'a> std::fmt::Display for ShardSlug<'a> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
@@ -222,16 +274,6 @@ impl From<[u8; 18]> for TenantShardId {
     }
 }
 
-/// For use within the context of a particular tenant, when we need to know which
-/// shard we're dealing with, but do not need to know the full ShardIdentity (because
-/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
-/// TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct ShardIndex {
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
 impl ShardIndex {
     pub fn new(number: ShardNumber, count: ShardCount) -> Self {
         Self {
@@ -246,6 +288,9 @@ impl ShardIndex {
         }
     }
 
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
     pub fn is_unsharded(&self) -> bool {
         self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
     }
@@ -313,6 +358,8 @@ impl Serialize for TenantShardId {
         if serializer.is_human_readable() {
             serializer.collect_str(self)
         } else {
+            // Note: while human encoding of [`TenantShardId`] is backward and forward
+            // compatible, this binary encoding is not.
             let mut packed: [u8; 18] = [0; 18];
             packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
             packed[16] = self.shard_number.0;
@@ -390,16 +437,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
 /// Default stripe size in pages: 256MiB divided by 8kiB page size.
 const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 
-/// The ShardIdentity contains the information needed for one member of map
-/// to resolve a key to a shard, and then check whether that shard is ==self.
-#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
-pub struct ShardIdentity {
-    pub number: ShardNumber,
-    pub count: ShardCount,
-    pub stripe_size: ShardStripeSize,
-    layout: ShardLayout,
-}
-
 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
 pub enum ShardConfigError {
     #[error("Invalid shard count")]
@@ -439,6 +476,9 @@ impl ShardIdentity {
         }
     }
 
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
     pub fn is_unsharded(&self) -> bool {
         self.number == ShardNumber(0) && self.count == ShardCount(0)
     }
@@ -487,6 +527,8 @@ impl ShardIdentity {
     }
 
     /// Return true if the key should be ingested by this shard
+    ///
+    /// Shards must ingest _at least_ keys which return true from this check.
     pub fn is_key_local(&self, key: &Key) -> bool {
         assert!(!self.is_broken());
         if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -496,8 +538,28 @@ impl ShardIdentity {
         }
     }
 
+    /// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
+    ///
+    /// When we fail to read a forknum block, this function tells us whether we may ignore the error
+    /// as a symptom of that issue.
+    pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
+        if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
+            return false;
+        }
+
+        let mut hash = murmurhash32(key.field4);
+        hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
+        let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
+
+        // The key may be affected by issue #7454: it is an initfork and it would not
+        // have mapped to shard 0 until we fixed that issue.
+        mapped_shard != ShardNumber(0)
+    }
+
     /// Return true if the key should be discarded if found in this shard's
-    /// data store, e.g. during compaction after a split
+    /// data store, e.g. during compaction after a split.
+    ///
+    /// Shards _may_ drop keys which return false here, but are not obliged to.
     pub fn is_key_disposable(&self, key: &Key) -> bool {
         if key_is_shard0(key) {
             // Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -523,7 +585,7 @@ impl ShardIdentity {
 
     /// Convenience for checking if this identity is the 0th shard in a tenant,
     /// for special cases on shard 0 such as ingesting relation sizes.
-    pub fn is_zero(&self) -> bool {
+    pub fn is_shard_zero(&self) -> bool {
         self.number == ShardNumber(0)
     }
 }
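The new `is_key_buggy_forknum` reuses the crate's key-to-shard mapping. The sketch below reimplements that mapping locally for illustration only; `murmurhash32` and `hash_combine` are assumed to mirror postgres's `hashfn.h`, as the context line closing this file's diff suggests.

```rust
// Sketch of the key->shard mapping: hash the relation id and the stripe
// number, combine them, then take the result modulo the shard count.
fn murmurhash32(mut h: u32) -> u32 {
    // murmur3 32-bit finalizer, as in postgres's hashfn.h
    h ^= h >> 16;
    h = h.wrapping_mul(0x85eb_ca6b);
    h ^= h >> 13;
    h = h.wrapping_mul(0xc2b2_ae35);
    h ^= h >> 16;
    h
}

fn hash_combine(mut a: u32, b: u32) -> u32 {
    // boost-style 32-bit hash_combine
    a ^= b
        .wrapping_add(0x9e37_79b9)
        .wrapping_add(a << 6)
        .wrapping_add(a >> 2);
    a
}

fn shard_of_block(rel: u32, block: u32, stripe_size: u32, shard_count: u8) -> u8 {
    let mut hash = murmurhash32(rel);
    hash = hash_combine(hash, murmurhash32(block / stripe_size));
    (hash % shard_count as u32) as u8
}

fn main() {
    // Same key always maps to the same shard.
    assert_eq!(
        shard_of_block(16384, 1000, 32768, 8),
        shard_of_block(16384, 1000, 32768, 8)
    );
}
```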
@@ -606,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool {
     // relation pages are distributed to shards other than shard zero. Everything else gets
     // stored on shard 0. This guarantees that shard 0 can independently serve basebackup
     // requests, and any request other than those for particular blocks in relations.
-    !is_rel_block_key(key)
+    //
+    // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
+    // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
+    // because they must be included in basebackups.
+    let is_initfork = key.field5 == INIT_FORKNUM;
+
+    !is_rel_block_key(key) || is_initfork
 }
 
 /// Provide the same result as the function in postgres `hashfn.h` with the same name
@@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
 pub use v14::bindings::{PageHeaderData, XLogRecord};
-pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+pub use v14::xlog_utils::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};
 
 pub use v14::bindings::{CheckPoint, ControlFileData};
@@ -4,7 +4,9 @@ use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
-use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+use postgres_ffi::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -262,11 +264,21 @@ fn craft_internal<C: postgres::GenericClient>(
         intermediate_lsns.insert(0, initial_lsn);
     }
 
-    // Some records may be not flushed, e.g. non-transactional logical messages.
+    // Some records may be not flushed, e.g. non-transactional logical messages. Flush now.
     //
-    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
-    // because pg_current_wal_insert_lsn skips page headers.
-    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
+    // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
+    // returns the position just after the page header on the next page. That's where the next
+    // record will be inserted. But the page header hasn't actually been written to the WAL
+    // yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
+    // error. Because of that, if the insert location is just after a page header, back off to
+    // previous page boundary.
+    let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
+    if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
+        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
+    } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
+        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
+    }
+    client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
     Ok(intermediate_lsns)
 }
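The back-off rule added above is easier to follow in isolation. A standalone sketch follows, where the 40-byte long-header and 24-byte short-header sizes are assumptions matching the conventional values of the imported constants:

```rust
// Sketch: if the insert LSN sits immediately after a page header, that
// header has not been written yet, so flushing to it would fail; back
// off to the page boundary instead.
const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
const XLOG_BLCKSZ: u64 = 8192;
const XLOG_SIZE_OF_XLOG_LONG_PHD: u64 = 40;
const XLOG_SIZE_OF_XLOG_SHORT_PHD: u64 = 24;

fn flushable_lsn(mut lsn: u64) -> u64 {
    if lsn % WAL_SEGMENT_SIZE == XLOG_SIZE_OF_XLOG_LONG_PHD {
        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD;
    } else if lsn % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD {
        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD;
    }
    lsn
}

fn main() {
    // Just past the short header on the second page of a segment:
    assert_eq!(flushable_lsn(8192 + 24), 8192);
    // Anywhere else is flushable as-is.
    assert_eq!(flushable_lsn(8192 + 100), 8292);
}
```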
@@ -320,38 +332,49 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
 
         client.execute("CREATE table t(x int)", &[])?;
 
-        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
-        // We will use logical message as the padding. We start with detecting how much WAL
-        // it takes for one logical message, considering all alignments and headers.
-        let base_wal_advance = {
-            let before_lsn = client.pg_current_wal_insert_lsn()?;
-            // Small non-empty message bigger than few bytes is more likely than an empty
-            // message to have the same format as the big padding message.
-            client.execute(
-                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
-                &[],
-            )?;
-            // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
-            (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
-                + XLOG_SIZE_OF_XLOG_RECORD
-        };
-        let mut remaining_lsn =
-            XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
-        if remaining_lsn < base_wal_advance {
-            remaining_lsn += XLOG_BLCKSZ;
-        }
-        let repeats = 10 + remaining_lsn - base_wal_advance;
-        info!(
-            "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
-            client.pg_current_wal_insert_lsn()?,
-            remaining_lsn,
-            base_wal_advance,
-            repeats
-        );
-        client.execute(
-            "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
-            &[&(repeats as i32)],
-        )?;
+        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We
+        // will use carefully-sized logical messages to advance WAL insert location such
+        // that there is just enough space on the page for the XLOG_SWITCH record.
+        loop {
+            // We start with measuring how much WAL it takes for one logical message,
+            // considering all alignments and headers.
+            let before_lsn = client.pg_current_wal_insert_lsn()?;
+            client.execute(
+                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
+                &[],
+            )?;
+            let after_lsn = client.pg_current_wal_insert_lsn()?;
+
+            // Did the record cross a page boundary? If it did, start over. Crossing a
+            // page boundary adds to the apparent size of the record because of the page
+            // header, which throws off the calculation.
+            if u64::from(before_lsn) / XLOG_BLCKSZ as u64
+                != u64::from(after_lsn) / XLOG_BLCKSZ as u64
+            {
+                continue;
+            }
+            // base_size is the size of a logical message without the payload
+            let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
+
+            // Is there enough space on the page for another logical message and an
+            // XLOG_SWITCH? If not, start over.
+            let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
+            if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
+                continue;
+            }
+
+            // We will write another logical message, such that after the logical message
+            // record, there will be space for exactly one XLOG_SWITCH. How large should
+            // the logical message's payload be? An XLOG_SWITCH record has no data => its
+            // size is exactly XLOG_SIZE_OF_XLOG_RECORD.
+            let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
+
+            client.execute(
+                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
+                &[&(repeats as i32)],
+            )?;
+            break;
+        }
         info!(
             "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
             client.pg_current_wal_insert_lsn()?,
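For concreteness, here is the payload sizing from the new loop with made-up numbers; the 24-byte XLOG_SWITCH size is an assumption standing in for `XLOG_SIZE_OF_XLOG_RECORD`:

```rust
// Sketch of the padding computation: page_remain bytes are left on the
// page, a logical message costs base_size bytes plus its payload, and
// the XLOG_SWITCH record must fit exactly in what remains.
fn padding_payload(page_remain: u64, base_size: u64, xlog_switch_size: u64) -> u64 {
    page_remain - base_size - xlog_switch_size
}

fn main() {
    // 200 bytes left, 54 bytes of message overhead, 24-byte switch record:
    assert_eq!(padding_payload(200, 54, 24), 122);
}
```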
@@ -134,6 +134,11 @@ impl RemotePath {
     pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
         self.0.strip_prefix(&p.0)
     }
+
+    pub fn add_trailing_slash(&self) -> Self {
+        // Unwrap safety: inputs are guaranteed to be valid UTF-8
+        Self(format!("{}/", self.0).try_into().unwrap())
+    }
 }
 
 /// We don't need callers to be able to pass arbitrary delimiters: just control
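Hypothetical usage of the new helper, assuming `RemotePath::from_string` and `get_path` are public as used elsewhere in this diff:

```rust
use remote_storage::RemotePath;

// "foo/bar" -> "foo/bar/": callers use this to make a prefix behave like a
// directory in delimiter-mode listings.
fn demo() {
    let p = RemotePath::from_string("foo/bar").unwrap();
    assert_eq!(p.add_trailing_slash().get_path().as_str(), "foo/bar/");
}
```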
@@ -157,47 +162,21 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// Lists all top level subdirectories for a given prefix
-    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
-    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
-    /// so this method doesnt need to.
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::WithDelimiter, None, cancel)
-            .await?
-            .prefixes;
-        Ok(result)
-    }
-    /// Lists all files in directory "recursively"
-    /// (not really recursively, because AWS has a flat namespace)
-    /// Note: This is subtely different than list_prefixes,
-    /// because it is for listing files instead of listing
-    /// names sharing common prefixes.
-    /// For example,
-    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
-    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
-    /// whereas,
-    /// list_prefixes("foo/bar/") = ["cat", "dog"]
-    /// See `test_real_s3.rs` for more details.
-    ///
-    /// max_keys limits max number of keys returned; None means unlimited.
-    async fn list_files(
-        &self,
-        prefix: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
-            .await?
-            .keys;
-        Ok(result)
-    }
-
+    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
+    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
+    ///
+    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
+    /// from the absolute root of the bucket.
+    ///
+    /// `mode` configures whether to use a delimiter. Without a delimiter, all keys
+    /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
+    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
+    /// returned in `keys`.
+    ///
+    /// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
+    /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
+    /// unlimited size buckets, as the full list of objects is allocated into a monolithic data structure.
+    ///
     async fn list(
         &self,
         prefix: Option<&RemotePath>,
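The delimiter semantics in the new doc comment can be paraphrased with plain strings. This is an illustrative sketch only, not the trait's implementation:

```rust
use std::collections::HashSet;

// Given keys relative to the listed prefix, a delimiter listing returns
// bare keys in `keys` and the first path component of nested keys in
// `prefixes`, each prefix at most once.
fn delimiter_listing<'a>(relative_keys: &[&'a str]) -> (Vec<&'a str>, HashSet<&'a str>) {
    let mut keys = Vec::new();
    let mut prefixes = HashSet::new();
    for key in relative_keys {
        match key.split_once('/') {
            Some((first, _rest)) => {
                prefixes.insert(first);
            }
            None => keys.push(*key),
        }
    }
    (keys, prefixes)
}

fn main() {
    let (keys, prefixes) = delimiter_listing(&["uncle", "parent/child", "parent/child2"]);
    assert_eq!(keys, vec!["uncle"]);
    assert_eq!(prefixes, HashSet::from(["parent"]));
}
```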
@@ -336,41 +315,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
     }
 }
 
-    // A function for listing all the files in a "directory"
-    // Example:
-    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
-    //
-    // max_keys limits max number of keys returned; None means unlimited.
-    pub async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
-        }
-    }
-
-    // lists common *prefixes*, if any of files
-    // Example:
-    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
-    pub async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
-            Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
-            Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
-            Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
-        }
-    }
-
     /// See [`RemoteStorage::upload`]
     pub async fn upload(
         &self,
@@ -565,6 +509,16 @@ impl GenericRemoteStorage {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct StorageMetadata(HashMap<String, String>);
 
+impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
+    fn from(arr: [(&str, &str); N]) -> Self {
+        let map: HashMap<String, String> = arr
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect();
+        Self(map)
+    }
+}
+
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
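Hypothetical usage of the new `From` impl, assuming `StorageMetadata` is exported at the crate root:

```rust
use remote_storage::StorageMetadata;

// Build metadata from an array literal instead of assembling a HashMap by hand.
fn layer_metadata() -> StorageMetadata {
    StorageMetadata::from([("content-type", "application/octet-stream"), ("kind", "layer")])
}
```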
@@ -5,11 +5,9 @@
 //! volume is mounted to the local FS.
 
 use std::{
-    borrow::Cow,
-    future::Future,
+    collections::HashSet,
     io::ErrorKind,
     num::NonZeroU32,
-    pin::Pin,
     time::{Duration, SystemTime, UNIX_EPOCH},
 };
 
@@ -22,11 +20,11 @@ use tokio::{
     io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tokio_util::{io::ReaderStream, sync::CancellationToken};
-use tracing::*;
-use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
+use utils::crashsafe::path_with_suffix_extension;
 
 use crate::{
     Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 use super::{RemoteStorage, StorageMetadata};
@@ -93,7 +91,47 @@ impl LocalFs {
 
     #[cfg(test)]
     async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
-        Ok(get_all_files(&self.storage_root, true)
+        use std::{future::Future, pin::Pin};
+        fn get_all_files<'a, P>(
+            directory_path: P,
+        ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
+        where
+            P: AsRef<Utf8Path> + Send + Sync + 'a,
+        {
+            Box::pin(async move {
+                let directory_path = directory_path.as_ref();
+                if directory_path.exists() {
+                    if directory_path.is_dir() {
+                        let mut paths = Vec::new();
+                        let mut dir_contents = fs::read_dir(directory_path).await?;
+                        while let Some(dir_entry) = dir_contents.next_entry().await? {
+                            let file_type = dir_entry.file_type().await?;
+                            let entry_path =
+                                Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
+                                    anyhow::Error::msg(format!(
+                                        "non-Unicode path: {}",
+                                        pb.to_string_lossy()
+                                    ))
+                                })?;
+                            if file_type.is_symlink() {
+                                tracing::debug!("{entry_path:?} is a symlink, skipping")
+                            } else if file_type.is_dir() {
+                                paths.extend(get_all_files(&entry_path).await?.into_iter())
+                            } else {
+                                paths.push(entry_path);
+                            }
+                        }
+                        Ok(paths)
+                    } else {
+                        bail!("Path {directory_path:?} is not a directory")
+                    }
+                } else {
+                    Ok(Vec::new())
+                }
+            })
+        }
+
+        Ok(get_all_files(&self.storage_root)
             .await?
             .into_iter()
             .map(|path| {
@@ -120,6 +158,14 @@ impl LocalFs {
         // S3 object list prefixes can be arbitrary strings, but when reading
         // the local filesystem we need a directory to start calling read_dir on.
         let mut initial_dir = full_path.clone();
+
+        // If there's no trailing slash, we have to start looking from one above: even if
+        // `initial_dir` is a directory, we should still list any prefixes in the parent
+        // that start with the same string.
+        if !full_path.to_string().ends_with('/') {
+            initial_dir.pop();
+        }
+
         loop {
             // Did we make it to the root?
             if initial_dir.parent().is_none() {
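A standalone sketch of the new rule using std paths instead of the crate's `Utf8PathBuf` (illustration only):

```rust
use std::path::PathBuf;

// For "timelines/tl/grandp" (no trailing slash) we must scan
// "timelines/tl", because sibling entries like "grandparent" share the
// "grandp" stem and still belong in the listing.
fn listing_start_dir(full_path: &str) -> PathBuf {
    let mut dir = PathBuf::from(full_path);
    if !full_path.ends_with('/') {
        dir.pop();
    }
    dir
}

fn main() {
    assert_eq!(
        listing_start_dir("timelines/tl/grandp"),
        PathBuf::from("timelines/tl")
    );
    assert_eq!(listing_start_dir("timelines/tl/"), PathBuf::from("timelines/tl"));
}
```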
@@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs {
         let op = async {
             let mut result = Listing::default();
 
-            if let ListingMode::NoDelimiter = mode {
-                let keys = self
-                    .list_recursive(prefix)
-                    .await
-                    .map_err(DownloadError::Other)?;
-
-                result.keys = keys
-                    .into_iter()
-                    .filter(|k| {
-                        let path = k.with_base(&self.storage_root);
-                        !path.is_dir()
-                    })
-                    .collect();
-
-                if let Some(max_keys) = max_keys {
-                    result.keys.truncate(max_keys.get() as usize);
-                }
-
-                return Ok(result);
-            }
-
-            let path = match prefix {
-                Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
-                None => Cow::Borrowed(&self.storage_root),
-            };
-
-            let prefixes_to_filter = get_all_files(path.as_ref(), false)
-                .await
-                .map_err(DownloadError::Other)?;
-
-            // filter out empty directories to mirror s3 behavior.
-            for prefix in prefixes_to_filter {
-                if prefix.is_dir()
-                    && is_directory_empty(&prefix)
-                        .await
-                        .map_err(DownloadError::Other)?
-                {
-                    continue;
-                }
-
-                let stripped = prefix
-                    .strip_prefix(&self.storage_root)
-                    .context("Failed to strip prefix")
-                    .and_then(RemotePath::new)
-                    .expect(
-                        "We list files for storage root, hence should be able to remote the prefix",
-                    );
-
-                if prefix.is_dir() {
-                    result.prefixes.push(stripped);
-                } else {
-                    result.keys.push(stripped);
-                }
-            }
-
+            // Filter out directories: in S3 directories don't exist, only the keys within them do.
+            let keys = self
+                .list_recursive(prefix)
+                .await
+                .map_err(DownloadError::Other)?;
+            let keys = keys
+                .into_iter()
+                .filter(|k| {
+                    let path = k.with_base(&self.storage_root);
+                    !path.is_dir()
+                })
+                .collect();
+
+            if let ListingMode::NoDelimiter = mode {
+                result.keys = keys;
+            } else {
+                let mut prefixes = HashSet::new();
+                for key in keys {
+                    // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
+                    let relative_key = if let Some(prefix) = prefix {
+                        let mut prefix = prefix.clone();
+                        // We only strip the dirname of the prefix, so that when we strip it from the start of keys we
+                        // end up with full file/dir names.
+                        let prefix_full_local_path = prefix.with_base(&self.storage_root);
+                        let has_slash = prefix.0.to_string().ends_with('/');
+                        let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
+                            prefix
+                        } else {
+                            prefix.0.pop();
+                            prefix
+                        };
+
+                        RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
+                    } else {
+                        key
+                    };
+
+                    let relative_key = format!("{}", relative_key);
+                    if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                        let first_part = relative_key
+                            .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                            .next()
+                            .unwrap()
+                            .to_owned();
+                        prefixes.insert(first_part);
+                    } else {
+                        result
+                            .keys
+                            .push(RemotePath::from_string(&relative_key).unwrap());
+                    }
+                }
+                result.prefixes = prefixes
+                    .into_iter()
+                    .map(|s| RemotePath::from_string(&s).unwrap())
+                    .collect();
+            }
+
+            if let Some(max_keys) = max_keys {
+                result.keys.truncate(max_keys.get() as usize);
+            }
             Ok(result)
         };
@@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
     path_with_suffix_extension(original_path, "metadata")
 }
 
-fn get_all_files<'a, P>(
-    directory_path: P,
-    recursive: bool,
-) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
-where
-    P: AsRef<Utf8Path> + Send + Sync + 'a,
-{
-    Box::pin(async move {
-        let directory_path = directory_path.as_ref();
-        if directory_path.exists() {
-            if directory_path.is_dir() {
-                let mut paths = Vec::new();
-                let mut dir_contents = fs::read_dir(directory_path).await?;
-                while let Some(dir_entry) = dir_contents.next_entry().await? {
-                    let file_type = dir_entry.file_type().await?;
-                    let entry_path =
-                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
-                            anyhow::Error::msg(format!(
-                                "non-Unicode path: {}",
-                                pb.to_string_lossy()
-                            ))
-                        })?;
-                    if file_type.is_symlink() {
-                        debug!("{entry_path:?} is a symlink, skipping")
-                    } else if file_type.is_dir() {
-                        if recursive {
-                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
-                        } else {
-                            paths.push(entry_path)
-                        }
-                    } else {
-                        paths.push(entry_path);
-                    }
-                }
-                Ok(paths)
-            } else {
-                bail!("Path {directory_path:?} is not a directory")
-            }
-        } else {
-            Ok(Vec::new())
-        }
-    })
-}
-
 async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
     let target_dir = match target_file_path.parent() {
         Some(parent_dir) => parent_dir,
@@ -923,13 +930,18 @@ mod fs_tests {
         // No delimiter: should recursively list everything
         let (storage, cancel) = create_storage()?;
         let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
+        let child_sibling =
+            upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
         let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
 
         let listing = storage
             .list(None, ListingMode::NoDelimiter, None, &cancel)
             .await?;
         assert!(listing.prefixes.is_empty());
-        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
+        assert_eq!(
+            listing.keys.into_iter().collect::<HashSet<_>>(),
+            HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
+        );
 
         // Delimiter: should only go one deep
         let listing = storage
@@ -942,7 +954,25 @@ mod fs_tests {
         );
         assert!(listing.keys.is_empty());
 
-        // Delimiter & prefix
+        // Delimiter & prefix with a trailing slash
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(
+            listing.keys,
+            [RemotePath::from_string("uncle").unwrap()].to_vec()
+        );
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("parent").unwrap()].to_vec()
+        );
+
+        // Delimiter and prefix without a trailing slash
         let listing = storage
             .list(
                 Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -951,12 +981,66 @@ mod fs_tests {
                 &cancel,
             )
             .await?;
+        assert_eq!(listing.keys, [].to_vec());
         assert_eq!(
             listing.prefixes,
-            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
-                .to_vec()
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
         );
-        assert_eq!(listing.keys, [uncle.clone()].to_vec());
+
+        // Delimiter and prefix that's partway through a path component
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(listing.keys, [].to_vec());
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn list_part_component() -> anyhow::Result<()> {
+        // No delimiter: should recursively list everything
+        let (storage, cancel) = create_storage()?;
+
+        // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
+        // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
+        // a freeform prefix.
+        let _child_a =
+            upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
+        let _child_b =
+            upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
+
+        // Delimiter and prefix that's partway through a path component
+        let listing = storage
+            .list(
+                Some(
+                    &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
+                ),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(listing.keys, [].to_vec());
+
+        let mut found_prefixes = listing.prefixes.clone();
+        found_prefixes.sort();
+        assert_eq!(
+            found_prefixes,
+            [
+                RemotePath::from_string("tenant").unwrap(),
+                RemotePath::from_string("tenant-01").unwrap(),
+            ]
+            .to_vec()
+        );
 
         Ok(())
     }
@@ -178,10 +178,7 @@ impl S3Bucket {
 
     pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
         assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .as_str()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path.get_path().as_str();
         match &self.prefix_in_bucket {
             Some(prefix) => prefix.clone() + "/" + path_string,
             None => path_string.to_string(),
@@ -471,16 +468,11 @@ impl RemoteStorage for S3Bucket {
         // get the passed prefix or if it is not set use prefix_in_bucket value
         let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                }
-                p
-            });
+            .or_else(|| {
+                self.prefix_in_bucket.clone().map(|mut s| {
+                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                    s
+                })
+            });
 
         let _permit = self.permit(kind, cancel).await?;
@@ -549,11 +541,15 @@ impl RemoteStorage for S3Bucket {
                 }
             }
 
-            result.prefixes.extend(
-                prefixes
-                    .iter()
-                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
-            );
+            // S3 gives us prefixes like "foo/", we return them like "foo"
+            result.prefixes.extend(prefixes.iter().filter_map(|o| {
+                Some(
+                    self.s3_object_to_relative_path(
+                        o.prefix()?
+                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
+                    ),
+                )
+            }));
 
             continuation_token = match response.next_continuation_token {
                 Some(new_token) => Some(new_token),
@@ -1050,22 +1046,22 @@ mod tests {
             Some("/test/prefix/"),
         ];
         let expected_outputs = [
-            vec!["", "some/path", "some/path"],
-            vec!["/", "/some/path", "/some/path"],
+            vec!["", "some/path", "some/path/"],
+            vec!["/", "/some/path", "/some/path/"],
             vec![
                 "test/prefix/",
                 "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
             ],
             vec![
                 "test/prefix/",
                 "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
        ];
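The updated vectors capture one behavioral change: a trailing separator on an input path is now preserved rather than trimmed. A simplified sketch of the join rule under test (hypothetical helper, not the real method):

```rust
// Keys are joined under the optional bucket prefix verbatim, so a
// trailing '/' on the input survives into the object key.
fn to_s3_object_key(prefix_in_bucket: Option<&str>, path: &str) -> String {
    match prefix_in_bucket {
        Some(prefix) => format!("{prefix}/{path}"),
        None => path.to_string(),
    }
}

fn main() {
    assert_eq!(
        to_s3_object_key(Some("test/prefix"), "some/path/"),
        "test/prefix/some/path/"
    );
}
```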
@@ -107,27 +107,6 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;
 
 impl RemoteStorage for UnreliableWrapper {
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list_prefixes(prefix, cancel).await
-    }
-
-    async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list_files(folder, max_keys, cancel).await
-    }
-
     async fn list(
         &self,
         prefix: Option<&RemotePath>,
@@ -1,5 +1,6 @@
 use anyhow::Context;
 use camino::Utf8Path;
+use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
@@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
     let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
         .context("common_prefix construction")?;
     let root_remote_prefixes = test_client
-        .list_prefixes(None, &cancel)
-        .await
-        .context("client list root prefixes failure")?
+        .list(None, ListingMode::WithDelimiter, None, &cancel)
+        .await?
+        .prefixes
         .into_iter()
         .collect::<HashSet<_>>();
     assert_eq!(
@@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
     );
 
     let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix), &cancel)
-        .await
-        .context("client list nested prefixes failure")?
+        .list(
+            Some(&base_prefix.add_trailing_slash()),
+            ListingMode::WithDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .prefixes
         .into_iter()
         .collect::<HashSet<_>>();
     let remote_only_prefixes = nested_remote_prefixes
@@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
 ///
 /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
 /// Then performs the following queries:
-/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
+/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
 #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
 #[tokio::test]
-async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
+async fn list_no_delimiter_works(
+    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
+) -> anyhow::Result<()> {
     let ctx = match ctx {
         MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
         MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
     let base_prefix =
         RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
     let root_files = test_client
-        .list_files(None, None, &cancel)
+        .list(None, ListingMode::NoDelimiter, None, &cancel)
         .await
         .context("client list root files failure")?
+        .keys
         .into_iter()
         .collect::<HashSet<_>>();
     assert_eq!(
         root_files,
         ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
+        "remote storage list on root mismatches with the uploads."
     );
 
     // Test that max_keys limit works. In total there are about 21 files (see
     // upload_simple_remote_data call in test_real_s3.rs).
     let limited_root_files = test_client
-        .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
+        .list(
+            None,
+            ListingMode::NoDelimiter,
+            Some(NonZeroU32::new(2).unwrap()),
+            &cancel,
+        )
         .await
         .context("client list root files failure")?;
-    assert_eq!(limited_root_files.len(), 2);
+    assert_eq!(limited_root_files.keys.len(), 2);
 
     let nested_remote_files = test_client
-        .list_files(Some(&base_prefix), None, &cancel)
+        .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
        .await
        .context("client list nested files failure")?
+        .keys
        .into_iter()
        .collect::<HashSet<_>>();
     let trim_remote_blobs: HashSet<_> = ctx
@@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
         .collect();
     assert_eq!(
         nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
+        "remote storage list on subdirrectory mismatches with the uploads."
     );
     Ok(())
 }
@@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
 
     ctx.client.delete_objects(&[path1, path2], &cancel).await?;
 
-    let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
+    let prefixes = ctx
+        .client
+        .list(None, ListingMode::WithDelimiter, None, &cancel)
+        .await?
+        .prefixes;
 
     assert_eq!(prefixes.len(), 1);
@@ -57,7 +57,6 @@ enum MaybeEnabledStorage {
     Disabled,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -86,7 +85,6 @@ struct AzureWithTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -134,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
     }
 }
 
-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
     Enabled(AzureWithSimpleTestBlobs),
     Disabled,
@@ -148,7 +142,6 @@ struct AzureWithSimpleTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -12,8 +12,8 @@ use anyhow::Context;
 use camino::Utf8Path;
 use futures_util::StreamExt;
 use remote_storage::{
-    DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
-    S3Config,
+    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
+    RemoteStorageKind, S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
         client: &Arc<GenericRemoteStorage>,
         cancel: &CancellationToken,
     ) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None, None, cancel))
-            .await
-            .context("list root files failure")?
-            .into_iter()
-            .collect::<HashSet<_>>())
+        Ok(
+            retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
+                .await
+                .context("list root files failure")?
+                .keys
+                .into_iter()
+                .collect::<HashSet<_>>(),
+        )
     }
 
     let cancel = CancellationToken::new();
@@ -219,7 +222,6 @@ enum MaybeEnabledStorage {
     Disabled,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -248,7 +250,6 @@ struct S3WithTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -296,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
     }
 }
 
-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
     Enabled(S3WithSimpleTestBlobs),
     Disabled,
@@ -310,7 +307,6 @@ struct S3WithSimpleTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -22,6 +22,7 @@ camino.workspace = true
 chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
+humantime.workspace = true
 hyper = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
21 libs/utils/src/env.rs Normal file
@@ -0,0 +1,21 @@
+//! Wrapper around `std::env::var` for parsing environment variables.
+
+use std::{fmt::Display, str::FromStr};
+
+pub fn var<V, E>(varname: &str) -> Option<V>
+where
+    V: FromStr<Err = E>,
+    E: Display,
+{
+    match std::env::var(varname) {
+        Ok(s) => Some(
+            s.parse()
+                .map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
+                .unwrap(),
+        ),
+        Err(std::env::VarError::NotPresent) => None,
+        Err(std::env::VarError::NotUnicode(_)) => {
+            panic!("env var {varname} is not unicode")
+        }
+    }
+}
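A minimal usage sketch for the new helper (illustrative only; the `NEON_TEST_PORT` name and the fallback value are invented for this example):

```rust
// Hypothetical caller: read an optional port override from the environment.
// `var` returns None if the variable is unset, and panics with a descriptive
// message if it is set but fails to parse.
fn port_from_env() -> u16 {
    utils::env::var("NEON_TEST_PORT").unwrap_or(5432)
}
```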
@@ -63,6 +63,7 @@ pub mod measured_stream;
 
 pub mod serde_percent;
 pub mod serde_regex;
+pub mod serde_system_time;
 
 pub mod pageserver_feedback;
 
@@ -89,6 +90,10 @@ pub mod yielding_loop;
 
 pub mod zstd;
 
+pub mod env;
+
+pub mod poison;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
121 libs/utils/src/poison.rs Normal file
@@ -0,0 +1,121 @@
+//! Protect a piece of state from reuse after it is left in an inconsistent state.
+//!
+//! # Example
+//!
+//! ```
+//! # tokio_test::block_on(async {
+//! use utils::poison::Poison;
+//! use std::time::Duration;
+//!
+//! struct State {
+//!     clean: bool,
+//! }
+//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
+//!
+//! let mut mutex_guard = state.lock().await;
+//! let mut poison_guard = mutex_guard.check_and_arm()?;
+//! let state = poison_guard.data_mut();
+//! state.clean = false;
+//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
+//! tokio::time::sleep(Duration::from_secs(10)).await;
+//! state.clean = true;
+//! poison_guard.disarm();
+//! # Ok::<(), utils::poison::Error>(())
+//! # });
+//! ```
+
+use tracing::warn;
+
+pub struct Poison<T> {
+    what: &'static str,
+    state: State,
+    data: T,
+}
+
+#[derive(Clone, Copy)]
+enum State {
+    Clean,
+    Armed,
+    Poisoned { at: chrono::DateTime<chrono::Utc> },
+}
+
+impl<T> Poison<T> {
+    /// We log `what` at `warn!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
+    pub fn new(what: &'static str, data: T) -> Self {
+        Self {
+            what,
+            state: State::Clean,
+            data,
+        }
+    }
+
+    /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
+    pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
+        match self.state {
+            State::Clean => {
+                self.state = State::Armed;
+                Ok(Guard(self))
+            }
+            State::Armed => unreachable!("transient state"),
+            State::Poisoned { at } => Err(Error::Poisoned {
+                what: self.what,
+                at,
+            }),
+        }
+    }
+}
+
+/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
+/// Once modifications are done, use [`Self::disarm`].
+/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
+/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
+pub struct Guard<'a, T>(&'a mut Poison<T>);
+
+impl<'a, T> Guard<'a, T> {
+    pub fn data(&self) -> &T {
+        &self.0.data
+    }
+    pub fn data_mut(&mut self) -> &mut T {
+        &mut self.0.data
+    }
+
+    pub fn disarm(self) {
+        match self.0.state {
+            State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
+            State::Armed => {
+                self.0.state = State::Clean;
+            }
+            State::Poisoned { at } => {
+                unreachable!("we fail check_and_arm() if it's in that state: {at}")
+            }
+        }
+    }
+}
+
+impl<'a, T> Drop for Guard<'a, T> {
+    fn drop(&mut self) {
+        match self.0.state {
+            State::Clean => {
+                // set by disarm()
+            }
+            State::Armed => {
+                // still armed => poison it
+                let at = chrono::Utc::now();
+                self.0.state = State::Poisoned { at };
+                warn!(at=?at, "poisoning {}", self.0.what);
+            }
+            State::Poisoned { at } => {
+                unreachable!("we fail check_and_arm() if it's in that state: {at}")
+            }
+        }
+    }
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    #[error("poisoned at {at}: {what}")]
+    Poisoned {
+        what: &'static str,
+        at: chrono::DateTime<chrono::Utc>,
+    },
+}
@@ -182,6 +182,18 @@ where
         }
     }
 
+    /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
+    pub fn would_wait_for(&self, num: V) -> Result<(), V> {
+        let internal = self.internal.lock().unwrap();
+        let cnt = internal.current.cnt_value();
+        drop(internal);
+        if cnt >= num {
+            Ok(())
+        } else {
+            Err(cnt)
+        }
+    }
+
     /// Register and return a channel that will be notified when a number arrives,
     /// or None, if it has already arrived.
     fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
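For context, a sketch of how the new non-blocking check might be used; the `log_if_behind` helper and its bounds are invented, assuming the `SeqWait`/`MonotonicCounter` types from this module and an `Lsn`-valued counter:

```rust
use utils::lsn::Lsn;
use utils::seqwait::{MonotonicCounter, SeqWait};

// Peek whether wait_for(lsn) would block, without registering a waiter.
fn log_if_behind<C: MonotonicCounter<Lsn>>(waiters: &SeqWait<C, Lsn>, lsn: Lsn) {
    match waiters.would_wait_for(lsn) {
        // `lsn` has already arrived; wait_for(lsn) would return immediately.
        Ok(()) => {}
        // wait_for(lsn) would block; Err carries the counter's current value.
        Err(current) => tracing::trace!("behind: counter at {current}, want {lsn}"),
    }
}
```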
55 libs/utils/src/serde_system_time.rs Normal file
@@ -0,0 +1,55 @@
+//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
+
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
+#[serde(transparent)]
+pub struct SystemTime(
+    #[serde(
+        deserialize_with = "deser_rfc3339_millis",
+        serialize_with = "ser_rfc3339_millis"
+    )]
+    pub std::time::SystemTime,
+);
+
+fn ser_rfc3339_millis<S: serde::ser::Serializer>(
+    ts: &std::time::SystemTime,
+    serializer: S,
+) -> Result<S::Ok, S::Error> {
+    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
+}
+
+fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
+    humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
+    fn to_millisecond_precision(time: SystemTime) -> SystemTime {
+        match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
+            Ok(duration) => {
+                let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
+                SystemTime(
+                    std::time::SystemTime::UNIX_EPOCH
+                        + std::time::Duration::from_millis(total_millis),
+                )
+            }
+            Err(_) => time,
+        }
+    }
+
+    #[test]
+    fn test_serialize_deserialize() {
+        let input = SystemTime(std::time::SystemTime::now());
+        let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
+        let serialized = serde_json::to_string(&input).unwrap();
+        assert_eq!(expected_serialized, serialized);
+        let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
+        assert_eq!(to_millisecond_precision(input), deserialized);
+    }
+}
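To make the wire format concrete, a small sketch assuming `serde_json` on the consumer side; the expected string follows from `format_rfc3339_millis`:

```rust
#[test]
fn wire_format_sketch() {
    let ts = utils::serde_system_time::SystemTime(std::time::SystemTime::UNIX_EPOCH);
    // Serialized as an RFC3339 JSON string with fixed millisecond precision.
    assert_eq!(
        serde_json::to_string(&ts).unwrap(),
        "\"1970-01-01T00:00:00.000Z\""
    );
}
```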
@@ -192,6 +192,14 @@ impl<T> OnceCell<T> {
         }
     }
 
+    /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
+    /// initialized.
+    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
+        let inner = self.inner.get_mut().unwrap();
+
+        inner.take_and_deinit()
+    }
+
     /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
     pub fn initializer_count(&self) -> usize {
         self.initializers.load(Ordering::Relaxed)
@@ -246,15 +254,23 @@ impl<'a, T> Guard<'a, T> {
     /// The permit will be on a semaphore part of the new internal value, and any following
     /// [`OnceCell::get_or_init`] will wait on it to complete.
     pub fn take_and_deinit(mut self) -> (T, InitPermit) {
+        self.0
+            .take_and_deinit()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> Inner<T> {
+    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
+        let value = self.value.take()?;
+
         let mut swapped = Inner::default();
         let sem = swapped.init_semaphore.clone();
         // acquire and forget right away, moving the control over to InitPermit
         sem.try_acquire().expect("we just created this").forget();
-        std::mem::swap(&mut *self.0, &mut swapped);
-        swapped
-            .value
-            .map(|v| (v, InitPermit(sem)))
-            .expect("guard is not created unless value has been initialized")
+        let permit = InitPermit(sem);
+        std::mem::swap(self, &mut swapped);
+        Some((value, permit))
     }
 }
 
@@ -263,6 +279,13 @@ impl<'a, T> Guard<'a, T> {
 /// On drop, this type will return the permit.
 pub struct InitPermit(Arc<tokio::sync::Semaphore>);
 
+impl std::fmt::Debug for InitPermit {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let ptr = Arc::as_ptr(&self.0) as *const ();
+        f.debug_tuple("InitPermit").field(&ptr).finish()
+    }
+}
+
 impl Drop for InitPermit {
     fn drop(&mut self) {
         assert_eq!(
@@ -559,4 +582,22 @@ mod tests {
 
         assert_eq!(*target.get().unwrap(), 11);
     }
+
+    #[tokio::test]
+    async fn take_and_deinit_on_mut() {
+        use std::convert::Infallible;
+
+        let mut target = OnceCell::<u32>::default();
+        assert!(target.take_and_deinit().is_none());
+
+        target
+            .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
+            .await
+            .unwrap();
+
+        let again = target.take_and_deinit();
+        assert!(matches!(again, Some((42, _))), "{again:?}");
+
+        assert!(target.take_and_deinit().is_none());
+    }
 }
@@ -70,6 +70,7 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
+twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
@@ -27,30 +27,50 @@
 //!
 //! # Reference Numbers
 //!
-//! 2024-03-20 on i3en.3xlarge
+//! 2024-04-15 on i3en.3xlarge
 //!
 //! ```text
-//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
-//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
-//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
-//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
-//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
-//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
-//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
-//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
-//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
-//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
-//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
-//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
-//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
-//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
-//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
-//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
+//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
+//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
+//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
+//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
+//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
+//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
+//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
+//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
+//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
+//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
+//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
+//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
+//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
+//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
+//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
+//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
+//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
+//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
+//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
+//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
+//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
+//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
+//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
+//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
+//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
+//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
+//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
+//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
+//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
+//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
+//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
+//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
 //! ```
 
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
-use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
+use pageserver::{
+    config::PageServerConf,
+    walrecord::NeonWalRecord,
+    walredo::{PostgresRedoManager, ProcessKind},
+};
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
     sync::Arc,
@@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};
 
 fn bench(c: &mut Criterion) {
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("short");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::short_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
-    }
-
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("medium");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::medium_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
-    }
+    for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
+        {
+            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+            for nclients in nclients {
+                let mut group = c.benchmark_group(format!("{process_kind}-short"));
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(nclients),
+                    &nclients,
+                    |b, nclients| {
+                        let redo_work = Arc::new(Request::short_input());
+                        b.iter_custom(|iters| {
+                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
+                        });
+                    },
+                );
+            }
+        }
+
+        {
+            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+            for nclients in nclients {
+                let mut group = c.benchmark_group(format!("{process_kind}-medium"));
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(nclients),
+                    &nclients,
+                    |b, nclients| {
+                        let redo_work = Arc::new(Request::medium_input());
+                        b.iter_custom(|iters| {
+                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
+                        });
+                    },
+                );
+            }
+        }
+    }
 }
@@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench);
 criterion::criterion_main!(benches);
 
 // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
+fn bench_impl(
+    process_kind: ProcessKind,
+    redo_work: Arc<Request>,
+    n_redos: u64,
+    nclients: u64,
+) -> Duration {
     let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
 
-    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
+    let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
+    conf.walredo_process_kind = process_kind;
     let conf = Box::leak(Box::new(conf));
     let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
@@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
     let manager = PostgresRedoManager::new(conf, tenant_shard_id);
     let manager = Arc::new(manager);
 
+    // divide the amount of work equally among the clients.
+    let nredos_per_client = n_redos / nclients;
     for _ in 0..nclients {
         rt.block_on(async {
             tasks.spawn(client(
                 Arc::clone(&manager),
                 Arc::clone(&start),
                 Arc::clone(&redo_work),
-                // divide the amount of work equally among the clients
-                n_redos / nclients,
+                nredos_per_client,
             ))
         });
     }
 
-    rt.block_on(async move {
-        let mut total_wallclock_time = std::time::Duration::from_millis(0);
+    let elapsed = rt.block_on(async move {
+        let mut total_wallclock_time = Duration::ZERO;
         while let Some(res) = tasks.join_next().await {
             total_wallclock_time += res.unwrap();
         }
         total_wallclock_time
-    })
+    });
+
+    // consistency check to ensure process kind setting worked
+    if nredos_per_client > 0 {
+        assert_eq!(
+            manager
+                .status()
+                .process
+                .map(|p| p.kind)
+                .expect("the benchmark work causes a walredo process to be spawned"),
+            std::borrow::Cow::Borrowed(process_kind.into())
+        );
+    }
+
+    elapsed
 }
 
 async fn client(
@@ -128,12 +128,12 @@ impl Client {
 
     pub async fn timeline_info(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         force_await_logical_size: ForceAwaitLogicalSize,
     ) -> Result<pageserver_api::models::TimelineInfo> {
         let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
             self.mgmt_api_endpoint
         );
 
@@ -151,11 +151,11 @@ impl Client {
 
     pub async fn keyspace(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
     ) -> Result<pageserver_api::models::partitioning::Partitioning> {
         let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
             self.mgmt_api_endpoint
         );
         self.get(&uri)
@@ -279,7 +279,7 @@ impl Client {
         lazy: bool,
     ) -> Result<()> {
         let req_body = TenantLocationConfigRequest {
-            tenant_id: Some(tenant_shard_id),
+            tenant_id: None,
             config,
         };
 
@@ -11,7 +11,6 @@ default = []
 anyhow.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
-async-trait.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
 chrono = { workspace = true, features = ["serde"] }
@@ -43,7 +43,8 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
     fanout: u64,
     ctx: &E::RequestContext,
 ) -> anyhow::Result<()> {
-    assert!(fanout >= 2);
+    assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
+    let exp_base = fanout.max(2);
     // Start at L0
     let mut current_level_no = 0;
     let mut current_level_target_height = target_file_size;
@@ -106,7 +107,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
             break;
         }
         current_level_no += 1;
-        current_level_target_height = current_level_target_height.saturating_mul(fanout);
+        current_level_target_height = current_level_target_height.saturating_mul(exp_base);
     }
     Ok(())
 }
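To see what the relaxed assertion changes, a sketch of the per-level target heights; `level_heights` is an invented helper mirroring the loop above, and the numbers are illustrative:

```rust
// How current_level_target_height evolves: exp_base = fanout.max(2) keeps the
// growth exponential even for fanout = 1, which previously hit the assert.
fn level_heights(target_file_size: u64, fanout: u64, levels: usize) -> Vec<u64> {
    let exp_base = fanout.max(2);
    std::iter::successors(Some(target_file_size), |h| Some(h.saturating_mul(exp_base)))
        .take(levels)
        .collect()
}

// level_heights(8, 1, 4) == [8, 16, 32, 64]   (fanout = 1 no longer panics)
// level_heights(8, 4, 4) == [8, 32, 128, 512]
```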
@@ -180,7 +180,7 @@ where
             match top.deref_mut() {
                 LazyLoadLayer::Unloaded(ref mut l) => {
                     let fut = l.load_keys(this.ctx);
-                    this.load_future.set(Some(fut));
+                    this.load_future.set(Some(Box::pin(fut)));
                     continue;
                 }
                 LazyLoadLayer::Loaded(ref mut entries) => {
@@ -3,7 +3,6 @@
 //!
 //! All the heavy lifting is done by the create_image and create_delta
 //! functions that the implementor provides.
-use async_trait::async_trait;
 use futures::Future;
 use pageserver_api::{key::Key, keyspace::key_range_size};
 use std::ops::Range;
@@ -141,18 +140,16 @@ pub trait CompactionLayer<K: CompactionKey + ?Sized> {
 
     fn is_delta(&self) -> bool;
 }
 
-#[async_trait]
 pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
     type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
     where
         Self: 'a;
 
     /// Return all keys in this delta layer.
-    async fn load_keys<'a>(
+    fn load_keys<'a>(
         &self,
         ctx: &E::RequestContext,
-    ) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
+    ) -> impl Future<Output = anyhow::Result<Vec<Self::DeltaEntry<'_>>>> + Send;
 }
 
 pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
@@ -2,7 +2,6 @@ mod draw;
 
 use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
 
-use async_trait::async_trait;
 use futures::StreamExt;
 use rand::Rng;
 use tracing::info;
@@ -139,7 +138,6 @@ impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
     }
 }
 
-#[async_trait]
 impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
     type DeltaEntry<'a> = MockRecord;
 
@@ -12,9 +12,14 @@ bytes.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
+humantime.workspace = true
 pageserver = { path = ".." }
+pageserver_api.workspace = true
+remote_storage = { path = "../../libs/remote_storage" }
 postgres_ffi.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
+toml_edit.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
@@ -9,18 +9,45 @@
 //! Coordinates in both axis are compressed for better readability.
 //! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
 //!
-//! Example use:
+//! The plain text API was chosen so that we can easily work with filenames from various
+//! sources; see the Usage section below for examples.
+//!
+//! # Usage
+//!
+//! ## Producing the SVG
+//!
 //! ```bash
-//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
-//!     grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
-//! $ firefox out.svg
+//!
+//! # local timeline dir
+//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
+//!   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
+//!
+//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
+//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg
+//!
+//! # From an `index_part.json` in S3
+//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
+//!
 //! ```
 //!
-//! This API was chosen so that we can easily work with filenames extracted from ssh,
-//! or from pageserver log files.
+//! ## Viewing
 //!
-//! TODO Consider shipping this as a grafana panel plugin:
-//! <https://grafana.com/tutorials/build-a-panel-plugin/>
+//! **Inkscape** is better than the built-in viewers in browsers.
+//!
+//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X)
+//! to see the layer file name in the comment field.
+//!
+//! ```bash
+//!
+//! # Linux
+//! inkscape out.svg
+//!
+//! # macOS
+//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg
+//!
+//! ```
+//!
 
 use anyhow::Result;
 use pageserver::repository::Key;
 use pageserver::METADATA_FILE_NAME;
@@ -65,7 +92,12 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
 
 pub fn main() -> Result<()> {
     // Parse layer filenames from stdin
-    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
+    struct Layer {
+        filename: String,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+    }
+    let mut files: Vec<Layer> = vec![];
     let stdin = io::stdin();
     for line in stdin.lock().lines() {
         let line = line.unwrap();
@@ -76,14 +108,23 @@ pub fn main() -> Result<()> {
             // Don't try and parse "metadata" like a key-lsn range
             continue;
         }
-        let range = parse_filename(filename);
-        ranges.push(range);
+        let (key_range, lsn_range) = parse_filename(filename);
+        files.push(Layer {
+            filename: filename.to_owned(),
+            key_range,
+            lsn_range,
+        });
     }
 
     // Collect all coordinates
     let mut keys: Vec<Key> = vec![];
     let mut lsns: Vec<Lsn> = vec![];
-    for (keyr, lsnr) in &ranges {
+    for Layer {
+        key_range: keyr,
+        lsn_range: lsnr,
+        ..
+    } in &files
+    {
         keys.push(keyr.start);
         keys.push(keyr.end);
         lsns.push(lsnr.start);
@@ -107,7 +148,12 @@ pub fn main() -> Result<()> {
             h: stretch * lsn_map.len() as f32
         }
     );
-    for (keyr, lsnr) in &ranges {
+    for Layer {
+        filename,
+        key_range: keyr,
+        lsn_range: lsnr,
+    } in &files
+    {
         let key_start = *key_map.get(&keyr.start).unwrap();
         let key_end = *key_map.get(&keyr.end).unwrap();
         let key_diff = key_end - key_start;
@@ -151,6 +197,7 @@ pub fn main() -> Result<()> {
             .fill(fill)
             .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
             .border_radius(0.4)
+            .comment(filename)
     );
     }
     println!("{}", EndSvg);
@@ -9,6 +9,11 @@ mod index_part;
 mod layer_map_analyzer;
 mod layers;
 
+use std::{
+    str::FromStr,
+    time::{Duration, SystemTime},
+};
+
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
 use index_part::IndexPartCmd;
@@ -20,8 +25,16 @@ use pageserver::{
     tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
     virtual_file,
 };
+use pageserver_api::shard::TenantShardId;
 use postgres_ffi::ControlFileData;
-use utils::{lsn::Lsn, project_git_version};
+use remote_storage::{RemotePath, RemoteStorageConfig};
+use tokio_util::sync::CancellationToken;
+use utils::{
+    id::TimelineId,
+    logging::{self, LogFormat, TracingErrorLayerEnablement},
+    lsn::Lsn,
+    project_git_version,
+};
 
 project_git_version!(GIT_VERSION);
 
@@ -43,6 +56,7 @@ enum Commands {
     #[command(subcommand)]
     IndexPart(IndexPartCmd),
     PrintLayerFile(PrintLayerFileCmd),
+    TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
     DrawTimeline {},
     AnalyzeLayerMap(AnalyzeLayerMapCmd),
     #[command(subcommand)]
@@ -68,6 +82,26 @@ struct PrintLayerFileCmd {
     path: Utf8PathBuf,
 }
 
+/// Roll back the time for the specified prefix using S3 history.
+///
+/// The command is fairly low level and powerful. Validation is only very light,
+/// so it is more powerful, and thus potentially more dangerous.
+#[derive(Parser)]
+struct TimeTravelRemotePrefixCmd {
+    /// A configuration string for the remote_storage configuration.
+    ///
+    /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
+    config_toml_str: String,
+    /// remote prefix to time travel recover. For safety reasons, we require it to contain
+    /// a timeline or tenant ID in the prefix.
+    prefix: String,
+    /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
+    travel_to: String,
+    /// Timestamp of the start of the operation, must be after any changes we want to roll back and after.
+    /// You can use a few seconds before invoking the command. Same format as `travel_to`.
+    done_if_after: Option<String>,
+}
+
 #[derive(Parser)]
 struct AnalyzeLayerMapCmd {
     /// Pageserver data path
|
|||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
|
logging::init(
|
||||||
|
LogFormat::Plain,
|
||||||
|
TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||||
|
logging::Output::Stdout,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||||
|
|
||||||
let cli = CliOpts::parse();
|
let cli = CliOpts::parse();
|
||||||
|
|
||||||
match cli.command {
|
match cli.command {
|
||||||
@@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> {
                 print_layerfile(&cmd.path).await?;
             }
         }
+        Commands::TimeTravelRemotePrefix(cmd) => {
+            let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
+                .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
+
+            let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
+                humantime::parse_rfc3339(done_if_after).map_err(|_e| {
+                    anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
+                })?
+            } else {
+                const SAFETY_MARGIN: Duration = Duration::from_secs(3);
+                tokio::time::sleep(SAFETY_MARGIN).await;
+                // Convert to string representation and back to get rid of sub-second values
+                let done_if_after = SystemTime::now();
+                tokio::time::sleep(SAFETY_MARGIN).await;
+                done_if_after
+            };
+
+            let timestamp = strip_subsecond(timestamp);
+            let done_if_after = strip_subsecond(done_if_after);
+
+            let Some(prefix) = validate_prefix(&cmd.prefix) else {
+                println!("specified prefix '{}' failed validation", cmd.prefix);
+                return Ok(());
+            };
+            let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
+            let toml_item = toml_document
+                .get("remote_storage")
+                .expect("need remote_storage");
+            let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
+            let storage = remote_storage::GenericRemoteStorage::from_config(&config);
+            let cancel = CancellationToken::new();
+            storage
+                .unwrap()
+                .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
+                .await?;
+        }
     };
     Ok(())
 }
@@ -185,3 +263,89 @@ fn handle_metadata(
 
     Ok(())
 }
+
+/// Ensures that the given S3 prefix is sufficiently constrained.
+/// The command is very risky already and we don't want to expose something
+/// that allows usually unintentional and quite catastrophic time travel of
+/// an entire bucket, which would be a major catastrophe, and only one
+/// character change away (similar to "rm -r /home /username/foobar").
+fn validate_prefix(prefix: &str) -> Option<RemotePath> {
+    if prefix.is_empty() {
+        // Empty prefix means we want to specify the *whole* bucket
+        return None;
+    }
+    let components = prefix.split('/').collect::<Vec<_>>();
+    let (last, components) = {
+        let last = components.last()?;
+        if last.is_empty() {
+            (
+                components.iter().nth_back(1)?,
+                &components[..(components.len() - 1)],
+            )
+        } else {
+            (last, &components[..])
+        }
+    };
+    'valid: {
+        if let Ok(_timeline_id) = TimelineId::from_str(last) {
+            // Ends in either a tenant or timeline ID
+            break 'valid;
+        }
+        if *last == "timelines" {
+            if let Some(before_last) = components.iter().nth_back(1) {
+                if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
+                    // Has a valid tenant id
+                    break 'valid;
+                }
+            }
+        }
+
+        return None;
+    }
+    RemotePath::from_string(prefix).ok()
+}
+
+fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
+    let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
+    humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_validate_prefix() {
+        assert_eq!(validate_prefix(""), None);
+        assert_eq!(validate_prefix("/"), None);
+        #[track_caller]
+        fn assert_valid(prefix: &str) {
+            let remote_path = RemotePath::from_string(prefix).unwrap();
+            assert_eq!(validate_prefix(prefix), Some(remote_path));
+        }
+        assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/");
+        // Path is not relative but absolute
+        assert_eq!(
+            validate_prefix(
+                "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"
+            ),
+            None
+        );
+        assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/");
+        // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
+        assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None);
+        assert_eq!(validate_prefix("wal"), None);
+        assert_eq!(validate_prefix("/wal/"), None);
+        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001");
+        // Partial tenant ID
+        assert_eq!(
+            validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"),
+            None
+        );
+        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
+        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
+        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
+        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
+        assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
+    }
+}
@@ -1,4 +1,5 @@
 use anyhow::Context;
+use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
 use pageserver_client::page_service::BasebackupRequest;
 
@@ -95,7 +96,7 @@ async fn main_impl(
             let timeline = *timeline;
             let info = mgmt_api_client
                 .timeline_info(
-                    timeline.tenant_id,
+                    TenantShardId::unsharded(timeline.tenant_id),
                     timeline.timeline_id,
                     ForceAwaitLogicalSize::No,
                 )
@@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::PagestreamGetPageRequest;
 
+use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
@@ -173,7 +174,10 @@ async fn main_impl(
             let timeline = *timeline;
             async move {
                 let partitioning = mgmt_api_client
-                    .keyspace(timeline.tenant_id, timeline.timeline_id)
+                    .keyspace(
+                        TenantShardId::unsharded(timeline.tenant_id),
+                        timeline.timeline_id,
+                    )
                     .await?;
                 let lsn = partitioning.at_lsn;
                 let start = Instant::now();
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 
 use humantime::Duration;
+use pageserver_api::shard::TenantShardId;
 use tokio::task::JoinSet;
 use utils::id::TenantTimelineId;
 
@@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
         let mgmt_api_client = Arc::clone(&mgmt_api_client);
         js.spawn(async move {
             let info = mgmt_api_client
-                .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                .timeline_info(
+                    TenantShardId::unsharded(tl.tenant_id),
+                    tl.timeline_id,
+                    ForceAwaitLogicalSize::Yes,
+                )
                 .await
                 .unwrap();
 
@@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
             while !info.current_logical_size_is_accurate {
                 ticker.tick().await;
                 info = mgmt_api_client
-                    .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                    .timeline_info(
+                        TenantShardId::unsharded(tl.tenant_id),
+                        tl.timeline_id,
+                        ForceAwaitLogicalSize::Yes,
+                    )
                     .await
                     .unwrap();
             }
112 pageserver/src/aux_file.rs Normal file
@@ -0,0 +1,112 @@
+use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
+use tracing::warn;
+
+/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
+fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
+    let mut key = [0; METADATA_KEY_SIZE];
+    let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
+    key[0] = AUX_KEY_PREFIX;
+    key[1] = dir_level1;
+    key[2] = dir_level2;
+    key[3..16].copy_from_slice(&hash[0..13]);
+    Key::from_metadata_key_fixed_size(&key)
+}
+
+const AUX_DIR_PG_LOGICAL: u8 = 0x01;
+const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
+const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
+
+/// Encode the aux file into a fixed-size key.
+///
+/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
+/// We have a one-to-one mapping for each of the aux files that we support. We hash the remaining part of the path
+/// (usually a single file name, or several components) into a 13-byte hash. The way we determine the 2-byte prefix
+/// is roughly based on the first two components of the path, one unique number for one component.
+///
+/// * pg_logical/mappings -> 0x0101
+/// * pg_logical/snapshots -> 0x0102
+/// * pg_logical/replorigin_checkpoint -> 0x0103
+/// * pg_logical/others -> 0x01FF
+/// * pg_replslot/ -> 0x0201
+/// * others -> 0xFFFF
+///
+/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
+/// The new file type must have never been written to the storage before. Otherwise, there could be data
+/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix.
+pub fn encode_aux_file_key(path: &str) -> Key {
+    if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
+    } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
+    } else if path == "pg_logical/replorigin_checkpoint" {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
+    } else if let Some(fname) = path.strip_prefix("pg_logical/") {
+        if cfg!(debug_assertions) {
+            warn!(
+                "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
+                path
+            );
+        }
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
+    } else if let Some(fname) = path.strip_prefix("pg_replslot/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
+    } else {
+        if cfg!(debug_assertions) {
+            warn!(
+                "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
+                path
+            );
+        }
+        aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_hash_portable() {
+        // AUX file encoding requires the hash to be portable across all platforms. This test case checks
+        // if the algorithm produces the same hash across different environments.
+        assert_eq!(
+            305317690835051308206966631765527126151,
+            twox_hash::xxh3::hash128("test1".as_bytes())
+        );
+        assert_eq!(
+            85104974691013376326742244813280798847,
+            twox_hash::xxh3::hash128("test/test2".as_bytes())
+        );
+        assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
+    }
+
+    #[test]
+    fn test_encoding_portable() {
+        // To correctly retrieve AUX files, the generated keys for the same file must be the same for all versions
+        // of the page server.
+        assert_eq!(
+            "8200000101E5B20C5F8DD5AA3289D6D9EAFA",
+            encode_aux_file_key("pg_logical/mappings/test1").to_string()
+        );
+        assert_eq!(
+            "820000010239AAC544893139B26F501B97E6",
+            encode_aux_file_key("pg_logical/snapshots/test2").to_string()
+        );
+        assert_eq!(
+            "820000010300000000000000000000000000",
+            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
+        );
+        assert_eq!(
+            "82000001FF8635AF2134B7266EC5B4189FD6",
+            encode_aux_file_key("pg_logical/unsupported").to_string()
+        );
+        assert_eq!(
+            "8200000201772D0E5D71DE14DA86142A1619",
+            encode_aux_file_key("pg_replslot/test3").to_string()
+        );
+        assert_eq!(
+            "820000FFFF1866EBEB53B807B26A2416F317",
+            encode_aux_file_key("other_file_not_supported").to_string()
+        );
+    }
+}
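A short sketch of the properties the encoding is meant to guarantee; the file names are made up, and `test_encoding_portable` above pins the exact expected keys:

```rust
#[test]
fn encoding_properties_sketch() {
    let a = encode_aux_file_key("pg_logical/snapshots/a.snap");
    let b = encode_aux_file_key("pg_logical/snapshots/b.snap");
    // Deterministic: the same path always maps to the same fixed-size key.
    assert_eq!(a, encode_aux_file_key("pg_logical/snapshots/a.snap"));
    // Distinct files land on distinct keys (a 13-byte hash makes collisions negligible).
    assert_ne!(a, b);
}
```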
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::{key_to_slru_block, Key};
+use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -297,7 +297,20 @@ where
             if rel.forknum == INIT_FORKNUM {
                 // I doubt we need _init fork itself, but having it at least
                 // serves as a marker relation is unlogged.
-                self.add_rel(rel, rel).await?;
+                if let Err(_e) = self.add_rel(rel, rel).await {
+                    if self
+                        .timeline
+                        .get_shard_identity()
+                        .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
+                    {
+                        // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
+                        // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows
+                        // postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and
+                        // recreate.
+                        tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
+                        continue;
+                    }
+                };
                 self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
                 continue;
             }
@@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
 use remote_storage::GenericRemoteStorage;
+use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tracing::*;
 
@@ -284,6 +285,7 @@ fn start_pageserver(
     ))
     .unwrap();
     pageserver::preinitialize_metrics();
+    pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
 
     // If any failpoints were set from FAILPOINTS environment variable,
     // print them to the log for debugging purposes
@@ -671,42 +673,37 @@ fn start_pageserver(
     let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
 
     // All started up! Now just sit and wait for shutdown signal.
-    {
-        use signal_hook::consts::*;
-        let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
-            let mut signals =
-                signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
-            return signals
-                .forever()
-                .next()
-                .expect("forever() never returns None unless explicitly closed");
-        });
-        let signal = BACKGROUND_RUNTIME
-            .block_on(signal_handler)
-            .expect("join error");
-        match signal {
-            SIGQUIT => {
-                info!("Got signal {signal}. Terminating in immediate shutdown mode",);
-                std::process::exit(111);
-            }
-            SIGINT | SIGTERM => {
-                info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
-
-                // This cancels the `shutdown_pageserver` cancellation tree.
-                // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
-                // The plan is to change that over time.
-                shutdown_pageserver.take();
-                let bg_remote_storage = remote_storage.clone();
-                let bg_deletion_queue = deletion_queue.clone();
-                BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
-                    &tenant_manager,
-                    bg_remote_storage.map(|_| bg_deletion_queue),
-                    0,
-                ));
-                unreachable!()
-            }
-            _ => unreachable!(),
-        }
-    }
+
+    {
+        BACKGROUND_RUNTIME.block_on(async move {
+            let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
+            let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
+            let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
+            let signal = tokio::select! {
+                _ = sigquit.recv() => {
+                    info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
+                    std::process::exit(111);
+                }
+                _ = sigint.recv() => { "SIGINT" },
+                _ = sigterm.recv() => { "SIGTERM" },
+            };
+
+            info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
+
+            // This cancels the `shutdown_pageserver` cancellation tree.
+            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
+            // The plan is to change that over time.
+            shutdown_pageserver.take();
+            let bg_remote_storage = remote_storage.clone();
+            let bg_deletion_queue = deletion_queue.clone();
+            pageserver::shutdown_pageserver(
+                &tenant_manager,
+                bg_remote_storage.map(|_| bg_deletion_queue),
+                0,
+            )
+            .await;
+            unreachable!()
+        })
+    }
 }
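The rewritten shutdown path above swaps the blocking signal_hook iterator for tokio's async signal streams, so the wait can live on BACKGROUND_RUNTIME without a dedicated blocking thread. A minimal, self-contained sketch of the same pattern (plain tokio, independent of the pageserver types; assumes tokio with the `signal`, `macros` and `rt-multi-thread` features):

    use tokio::signal::unix::{signal, SignalKind};

    #[tokio::main]
    async fn main() {
        // One stream per signal; registration can only fail at setup time.
        let mut sigint = signal(SignalKind::interrupt()).unwrap();
        let mut sigterm = signal(SignalKind::terminate()).unwrap();
        let mut sigquit = signal(SignalKind::quit()).unwrap();

        // select! resolves with whichever signal arrives first.
        let name = tokio::select! {
            _ = sigquit.recv() => "SIGQUIT",
            _ = sigint.recv() => "SIGINT",
            _ = sigterm.recv() => "SIGTERM",
        };
        println!("got {name}, shutting down");
    }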
@@ -97,6 +97,8 @@ pub mod defaults {
 
     pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
 
+    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
+
     ///
     /// Default built-in configuration file.
     ///
@@ -140,6 +142,8 @@ pub mod defaults {
 
 #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
 
+#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -290,6 +294,8 @@ pub struct PageServerConf {
     ///
     /// Setting this to zero disables limits on total ephemeral layer size.
     pub ephemeral_bytes_per_memory_kb: usize,
+
+    pub walredo_process_kind: crate::walredo::ProcessKind,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -413,6 +419,8 @@ struct PageServerConfigBuilder {
    validate_vectored_get: BuilderValue<bool>,
 
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
+
+   walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
 }
 
 impl PageServerConfigBuilder {
@@ -500,6 +508,8 @@ impl PageServerConfigBuilder {
             )),
             validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
             ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
+
+            walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
         }
     }
 }
@@ -683,6 +693,10 @@ impl PageServerConfigBuilder {
         self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
     }
 
+    pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
+        self.walredo_process_kind = BuilderValue::Set(value);
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let default = Self::default_values();
 
@@ -739,6 +753,7 @@ impl PageServerConfigBuilder {
             max_vectored_read_bytes,
             validate_vectored_get,
             ephemeral_bytes_per_memory_kb,
+            walredo_process_kind,
         }
         CUSTOM LOGIC
         {
@@ -1032,6 +1047,9 @@ impl PageServerConf {
             "ephemeral_bytes_per_memory_kb" => {
                 builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
             }
+            "walredo_process_kind" => {
+                builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
+            }
             _ => bail!("unrecognized pageserver option '{key}'"),
         }
     }
@@ -1114,6 +1132,7 @@ impl PageServerConf {
             ),
             validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
             ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
         }
     }
 }
@@ -1351,7 +1370,8 @@ background_task_maximum_delay = '334 s'
                 .expect("Invalid default constant")
             ),
             validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
+            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
         },
         "Correct defaults should be used when no config values are provided"
     );
@@ -1423,7 +1443,8 @@ background_task_maximum_delay = '334 s'
                 .expect("Invalid default constant")
             ),
             validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
+            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
         },
         "Should be able to parse all basic config values correctly"
     );
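The `.parse().unwrap()` calls introduced above rely on `crate::walredo::ProcessKind` implementing `FromStr` for the accepted spellings. A hypothetical sketch of such a type (the real definition lives in the walredo module and may differ, e.g. use a derive macro; a `Display` impl, omitted here, is also needed by the `{kind}` formatting in the metrics code):

    use std::str::FromStr;

    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub enum ProcessKind {
        Sync,
        Async,
    }

    impl FromStr for ProcessKind {
        type Err = String;

        fn from_str(s: &str) -> Result<Self, Self::Err> {
            match s {
                "sync" => Ok(ProcessKind::Sync),
                "async" => Ok(ProcessKind::Async),
                other => Err(format!("invalid walredo process kind: {other}")),
            }
        }
    }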
@@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker(
                 continue;
             }
 
-            if !tenant_shard_id.is_zero() {
+            if !tenant_shard_id.is_shard_zero() {
                 // We only send consumption metrics from shard 0, so don't waste time calculating
                 // synthetic size on other shards.
                 continue;
@@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics(
     };
 
     let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
-        if state != TenantState::Active || !id.is_zero() {
+        if state != TenantState::Active || !id.is_shard_zero() {
             None
         } else {
             tenant_manager
@@ -12,7 +12,7 @@ use pageserver_api::{
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
-use utils::{backoff, generation::Generation, id::NodeId};
+use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
 
 use crate::{
     config::{NodeMetadata, PageServerConf},
@@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                 .collect(),
         };
 
-        fail::fail_point!("control-plane-client-validate");
+        failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
+        if self.cancel.is_cancelled() {
+            return Err(RetryForeverError::ShuttingDown);
+        }
 
         let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
 
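The `sleep_millis_async!` failpoint macro used above is defined elsewhere in the tree; the shape it presumably expands to, a pause of a configured number of milliseconds that wakes early on cancellation, can be sketched with plain tokio and tokio_util (an illustration only; `sleep_or_cancel` is not a real helper in this codebase):

    use tokio_util::sync::CancellationToken;

    /// Sleep for `millis`, but return early if `cancel` fires first.
    async fn sleep_or_cancel(millis: u64, cancel: &CancellationToken) {
        tokio::select! {
            _ = tokio::time::sleep(std::time::Duration::from_millis(millis)) => {}
            _ = cancel.cancelled() => {}
        }
    }

Either way the caller still checks `cancel.is_cancelled()` afterwards, as the new code does, so a cancellation observed during the pause becomes `RetryForeverError::ShuttingDown` instead of another HTTP attempt.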
@@ -58,24 +58,6 @@ paths:
       responses:
         "200":
           description: The reload completed successfully.
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error (also hits if no keys were found)
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
 
   /v1/tenant/{tenant_id}:
     parameters:
@@ -93,62 +75,14 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/TenantInfo"
-        "400":
-          description: Error when no tenant id found in path or no timeline id
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
     delete:
       description: |
        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
        404 means that deletion successfully finished"
      responses:
-        "400":
-          description: Error when no tenant id found in path
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
         "404":
-          description: Tenant not found
+          description: Tenant not found. This is the success path.
           content:
            application/json:
              schema:
@@ -165,18 +99,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/time_travel_remote_storage:
     parameters:
@@ -206,36 +128,6 @@ paths:
             application/json:
              schema:
                type: string
-        "400":
-          description: Error when no tenant id found in path or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/timeline:
     parameters:
@@ -255,36 +147,6 @@ paths:
                 type: array
                items:
                  $ref: "#/components/schemas/TimelineInfo"
-        "400":
-          description: Error when no tenant id found in path
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
 
   /v1/tenant/{tenant_id}/timeline/{timeline_id}:
@@ -309,60 +171,12 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/TimelineInfo"
-        "400":
-          description: Error when no tenant id found in path or no timeline id
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
     delete:
       description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
      responses:
-        "400":
-          description: Error when no tenant id found in path or no timeline id
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
         "404":
-          description: Timeline not found
+          description: Timeline not found. This is the success path.
          content:
            application/json:
              schema:
@@ -379,18 +193,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
     parameters:
@@ -423,36 +225,6 @@ paths:
              schema:
                type: string
                format: date-time
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Timeline not found, or there is no timestamp information for the given lsn
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
 
   /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
     parameters:
@@ -484,36 +256,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/LsnByTimestampResponse"
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
     parameters:
@@ -537,36 +279,6 @@ paths:
             application/json:
              schema:
                type: string
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
   /v1/tenant/{tenant_shard_id}/location_config:
     parameters:
       - name: tenant_shard_id
@@ -628,24 +340,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/TenantLocationConfigResponse"
-        "503":
-          description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
         "409":
           description: |
             The tenant is already known to Pageserver in some way,
@@ -662,12 +356,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
   /v1/tenant/{tenant_id}/ignore:
     parameters:
       - name: tenant_id
@@ -684,36 +372,6 @@ paths:
       responses:
         "200":
           description: Tenant ignored
-        "400":
-          description: Error when no tenant id found in path parameters
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
 
   /v1/tenant/{tenant_id}/load:
@@ -740,36 +398,6 @@ paths:
       responses:
         "202":
           description: Tenant scheduled to load successfully
-        "400":
-          description: Error when no tenant id found in path parameters
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
     parameters:
@@ -790,37 +418,6 @@ paths:
       responses:
         "202":
           description: Tenant scheduled to load successfully
-        "404":
-          description: No tenant or timeline found for the specified ids
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
 
   /v1/tenant/{tenant_id}/synthetic_size:
     parameters:
@@ -839,31 +436,8 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/SyntheticSizeResponse"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
+  # This route has no handler. TODO: remove?
   /v1/tenant/{tenant_id}/size:
     parameters:
       - name: tenant_id
@@ -945,18 +519,6 @@ paths:
       responses:
         "200":
           description: Success
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_shard_id}/secondary/download:
     parameters:
@@ -987,20 +549,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/SecondaryProgress"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-
 
   /v1/tenant/{tenant_id}/timeline/:
     parameters:
@@ -1043,24 +591,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/TimelineInfo"
-        "400":
-          description: Malformed timeline create request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
         "406":
           description: Permanently unsatisfiable request, don't retry.
          content:
@@ -1079,18 +609,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/:
     get:
@@ -1104,30 +622,6 @@ paths:
                 type: array
                items:
                  $ref: "#/components/schemas/TenantInfo"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
     post:
       description: |
@@ -1148,43 +642,12 @@ paths:
             application/json:
              schema:
                type: string
-        "400":
-          description: Malformed tenant create request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
         "409":
           description: Tenant already exists, creation skipped
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
 
   /v1/tenant/config:
     put:
@@ -1206,36 +669,6 @@ paths:
                 type: array
                items:
                  $ref: "#/components/schemas/TenantInfo"
-        "400":
-          description: Malformed tenant config request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/config/:
     parameters:
@@ -1255,42 +688,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/TenantConfigResponse"
-        "400":
-          description: Malformed get tenanant config request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Tenand or timeline were not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/utilization:
     get:
@@ -1304,12 +701,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/PageserverUtilization"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
 
 components:
   securitySchemes:
@@ -1629,7 +1020,7 @@ components:
             type: integer
            format: int64
            minimum: 0
-            description: The amount of disk space currently utilized by layer files.
+            description: The amount of disk space currently used.
           free_space_bytes:
            type: integer
            format: int64
@@ -160,6 +160,9 @@ impl From<PageReconstructError> for ApiError {
     fn from(pre: PageReconstructError) -> ApiError {
         match pre {
             PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
+            PageReconstructError::MissingKey(e) => {
+                ApiError::InternalServerError(anyhow::anyhow!("{e}"))
+            }
             PageReconstructError::Cancelled => {
                 ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
             }
@@ -457,8 +460,12 @@ async fn reload_auth_validation_keys_handler(
             json_response(StatusCode::OK, ())
         }
         Err(e) => {
+            let err_msg = "Error reloading public keys";
             warn!("Error reloading public keys from {key_path:?}: {e:}");
-            json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
+            json_response(
+                StatusCode::INTERNAL_SERVER_ERROR,
+                HttpErrorBody::from_msg(err_msg.to_string()),
+            )
         }
     }
 }
@@ -696,7 +703,7 @@ async fn get_lsn_by_timestamp_handler(
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let state = get_state(&request);
 
-    if !tenant_shard_id.is_zero() {
+    if !tenant_shard_id.is_shard_zero() {
         // Requires SLRU contents, which are only stored on shard zero
         return Err(ApiError::BadRequest(anyhow!(
             "Size calculations are only available on shard zero"
@@ -747,7 +754,7 @@ async fn get_timestamp_of_lsn_handler(
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let state = get_state(&request);
 
-    if !tenant_shard_id.is_zero() {
+    if !tenant_shard_id.is_shard_zero() {
         // Requires SLRU contents, which are only stored on shard zero
         return Err(ApiError::BadRequest(anyhow!(
             "Size calculations are only available on shard zero"
@@ -772,7 +779,9 @@ async fn get_timestamp_of_lsn_handler(
             let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
             json_response(StatusCode::OK, time)
         }
-        None => json_response(StatusCode::NOT_FOUND, ()),
+        None => Err(ApiError::NotFound(
+            anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
+        )),
     }
 }
@@ -993,11 +1002,26 @@ async fn tenant_status(
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let state = get_state(&request);
 
+    // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
+    let activate = true;
+    #[cfg(feature = "testing")]
+    let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
+
     let tenant_info = async {
         let tenant = state
             .tenant_manager
             .get_attached_tenant_shard(tenant_shard_id)?;
+
+        if activate {
+            // This is advisory: we prefer to let the tenant activate on-demand when this function is
+            // called, but it is still valid to return 200 and describe the current state of the tenant
+            // if it doesn't make it into an active state.
+            tenant
+                .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
+                .await
+                .ok();
+        }
+
         // Calculate total physical size of all timelines
         let mut current_physical_size = 0;
         for timeline in tenant.list_timelines().iter() {
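With the `activate` flag above, builds compiled with the `testing` feature can inspect a tenant's state without nudging it towards activation, e.g. `GET /v1/tenant/{tenant_shard_id}?activate=false` (the query-string form is inferred from the `parse_query_param` call, not stated elsewhere); production builds always keep the advisory activation.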
@@ -1071,7 +1095,7 @@ async fn tenant_size_handler(
     let headers = request.headers();
     let state = get_state(&request);
 
-    if !tenant_shard_id.is_zero() {
+    if !tenant_shard_id.is_shard_zero() {
         return Err(ApiError::BadRequest(anyhow!(
             "Size calculations are only available on shard zero"
         )));
@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
+use pageserver_api::key::rel_block_to_key;
 use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio_tar::Archive;
 use tracing::*;
@@ -170,7 +171,10 @@ async fn import_rel(
         let r = reader.read_exact(&mut buf).await;
         match r {
             Ok(_) => {
-                modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
+                let key = rel_block_to_key(rel, blknum);
+                if modification.tline.get_shard_identity().is_key_local(&key) {
+                    modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
+                }
             }
 
             // TODO: UnexpectedEof is expected
@@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub use pageserver_api::keyspace;
+pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
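`is_key_local` here, like the `is_key_buggy_forknum` check in the basebackup hunk earlier, consults the tenant's `ShardIdentity` to decide which shard owns a key. As a rough illustration of the idea only (a hypothetical stand-in type; the real `ShardIdentity` places keys by striping block ranges across shards, not by a plain modulo):

    /// Simplified stand-in for the pageserver's ShardIdentity.
    struct ShardIdentity {
        shard_number: u32,
        shard_count: u32,
    }

    impl ShardIdentity {
        /// Does this shard own the given key? (toy placement: hash modulo count)
        fn is_key_local(&self, key_hash: u64) -> bool {
            self.shard_count <= 1 || key_hash % self.shard_count as u64 == self.shard_number as u64
        }
    }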
@@ -86,11 +86,20 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
-        "pageserver_read_num_fs_layers",
-        "Number of persistent layers accessed for processing a read request, including those in the cache",
-        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
+        "pageserver_layers_visited_per_read_global",
+        "Number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_layers_visited_per_vectored_read_global",
+        "Average number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
     )
     .expect("failed to define a metric")
 });
@@ -1483,12 +1492,18 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
 });
 
 pub(crate) struct WalIngestMetrics {
+    pub(crate) bytes_received: IntCounter,
     pub(crate) records_received: IntCounter,
     pub(crate) records_committed: IntCounter,
     pub(crate) records_filtered: IntCounter,
 }
 
 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
+    bytes_received: register_int_counter!(
+        "pageserver_wal_ingest_bytes_received",
+        "Bytes of WAL ingested from safekeepers",
+    )
+    .unwrap(),
     records_received: register_int_counter!(
         "pageserver_wal_ingest_records_received",
         "Number of WAL records received from safekeepers"
@@ -1512,7 +1527,8 @@ pub(crate) struct SecondaryModeMetrics {
     pub(crate) download_heatmap: IntCounter,
     pub(crate) download_layer: IntCounter,
 }
-pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
+pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
+    SecondaryModeMetrics {
     upload_heatmap: register_int_counter!(
         "pageserver_secondary_upload_heatmap",
         "Number of heatmaps written to remote storage by attached tenants"
@@ -1530,7 +1546,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
     .expect("failed to define a metric"),
     download_heatmap: register_int_counter!(
         "pageserver_secondary_download_heatmap",
-        "Number of downloads of heatmaps by secondary mode locations"
+        "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed"
     )
     .expect("failed to define a metric"),
     download_layer: register_int_counter!(
@@ -1538,6 +1554,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
         "Number of downloads of layers by secondary mode locations"
     )
     .expect("failed to define a metric"),
+    }
 });
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1813,6 +1830,29 @@ impl Default for WalRedoProcessCounters {
 pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
     Lazy::new(WalRedoProcessCounters::default);
 
+#[cfg(not(test))]
+pub mod wal_redo {
+    use super::*;
+
+    static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
+        std::sync::Mutex::new(
+            register_uint_gauge_vec!(
+                "pageserver_wal_redo_process_kind",
+                "The configured process kind for walredo",
+                &["kind"],
+            )
+            .unwrap(),
+        )
+    });
+
+    pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
+        // use guard to avoid races around the next two steps
+        let guard = PROCESS_KIND.lock().unwrap();
+        guard.reset();
+        guard.with_label_values(&[&format!("{kind}")]).set(1);
+    }
+}
+
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub(crate) struct StorageTimeMetricsTimer {
     metrics: StorageTimeMetrics,
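Since `set_process_kind_metric` resets the gauge vector before setting a single label value to 1, at most one `kind` series is ever non-zero. The scrape output would then contain a line like the following (sample only, assuming the `sync` default):

    pageserver_wal_redo_process_kind{kind="sync"} 1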
@@ -2083,7 +2123,7 @@ impl TimelineMetrics {
 
 pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
     // Only shard zero deals in synthetic sizes
-    if tenant_shard_id.is_zero() {
+    if tenant_shard_id.is_shard_zero() {
         let tid = tenant_shard_id.tenant_id.to_string();
         let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
     }
@@ -2094,6 +2134,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
 use futures::Future;
 use pin_project_lite::pin_project;
 use std::collections::HashMap;
+use std::num::NonZeroUsize;
 use std::pin::Pin;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
@@ -2663,6 +2704,26 @@ pub(crate) mod disk_usage_based_eviction {
     pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
 }
 
+static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_tokio_executor_thread_configured_count",
+        "Total number of configued tokio executor threads in the process.
+         The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
+        &["setup"],
+    )
+    .unwrap()
+});
+
+pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
+    static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
+    let _guard = SERIALIZE.lock().unwrap();
+    TOKIO_EXECUTOR_THREAD_COUNT.reset();
+    TOKIO_EXECUTOR_THREAD_COUNT
+        .get_metric_with_label_values(&[setup])
+        .unwrap()
+        .set(u64::try_from(num_threads.get()).unwrap());
+}
+
 pub fn preinitialize_metrics() {
     // Python tests need these and on some we do alerting.
     //
@@ -2719,7 +2780,8 @@ pub fn preinitialize_metrics() {
|
|||||||
|
|
||||||
// histograms
|
// histograms
|
||||||
[
|
[
|
||||||
&READ_NUM_FS_LAYERS,
|
&READ_NUM_LAYERS_VISITED,
|
||||||
|
&VEC_READ_NUM_LAYERS_VISITED,
|
||||||
&WAIT_LSN_TIME,
|
&WAIT_LSN_TIME,
|
||||||
&WAL_REDO_TIME,
|
&WAL_REDO_TIME,
|
||||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||||
|
|||||||
@@ -874,9 +874,20 @@ impl PageServerHandler {
|
|||||||
// walsender completes the authentication and starts streaming the
|
// walsender completes the authentication and starts streaming the
|
||||||
// WAL.
|
// WAL.
|
||||||
if lsn <= last_record_lsn {
|
if lsn <= last_record_lsn {
|
||||||
|
// It might be better to use max(lsn, latest_gc_cutoff_lsn) instead
|
||||||
|
// last_record_lsn. That would give the same result, since we know
|
||||||
|
// that there haven't been modifications since 'lsn'. Using an older
|
||||||
|
// LSN might be faster, because that could allow skipping recent
|
||||||
|
// layers when finding the page.
|
||||||
lsn = last_record_lsn;
|
lsn = last_record_lsn;
|
||||||
} else {
|
} else {
|
||||||
timeline.wait_lsn(lsn, ctx).await?;
|
timeline
|
||||||
|
.wait_lsn(
|
||||||
|
lsn,
|
||||||
|
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
// Since we waited for 'lsn' to arrive, that is now the last
|
// Since we waited for 'lsn' to arrive, that is now the last
|
||||||
// record LSN. (Or close enough for our purposes; the
|
// record LSN. (Or close enough for our purposes; the
|
||||||
// last-record LSN can advance immediately after we return
|
// last-record LSN can advance immediately after we return
|
||||||
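Every `wait_lsn` call site in this file now passes a `WaitLsnWaiter` value identifying who is waiting (`WaitLsnWaiter::PageService` here; the tenant code further down passes `WaitLsnWaiter::Tenant`). The diff never shows the enum definition itself; inferred from the call sites, it presumably looks something like this sketch:

```rust
// Hypothetical shape inferred from the call sites in this diff. The real
// definition lives in the pageserver timeline module and may carry more
// variants, one per kind of caller that waits for WAL to arrive.
#[derive(Debug)]
pub(crate) enum WaitLsnWaiter {
    PageService,
    Tenant,
}
```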
@@ -888,7 +899,13 @@ impl PageServerHandler {
                     "invalid LSN(0) in request".into(),
                 ));
             }
-            timeline.wait_lsn(lsn, ctx).await?;
+            timeline
+                .wait_lsn(
+                    lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    ctx,
+                )
+                .await?;
         }

         if lsn < **latest_gc_cutoff_lsn {

@@ -1189,6 +1206,10 @@ impl PageServerHandler {
         ))
     }

+    /// Note on "fullbackup":
+    /// Full basebackups should only be used for debugging purposes.
+    /// Originally, it was introduced to enable breaking storage format changes,
+    /// but that is not applicable anymore.
     #[allow(clippy::too_many_arguments)]
     #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
     async fn handle_basebackup_request<IO>(

@@ -1215,7 +1236,13 @@ impl PageServerHandler {
         if let Some(lsn) = lsn {
             // Backup was requested at a particular LSN. Wait for it to arrive.
             info!("waiting for {}", lsn);
-            timeline.wait_lsn(lsn, ctx).await?;
+            timeline
+                .wait_lsn(
+                    lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    ctx,
+                )
+                .await?;
             timeline
                 .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                 .context("invalid basebackup lsn")?;

@@ -252,16 +252,8 @@ impl Timeline {
         let mut buf = version.get(self, key, ctx).await?;
         let nblocks = buf.get_u32_le();

-        if latest {
-            // Update relation size cache only if "latest" flag is set.
-            // This flag is set by compute when it is working with most recent version of relation.
-            // Typically master compute node always set latest=true.
-            // Please notice, that even if compute node "by mistake" specifies old LSN but set
-            // latest=true, then it can not cause cache corruption, because with latest=true
-            // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
-            // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
-        }
+        self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+
         Ok(nblocks)
     }

@@ -456,6 +448,11 @@ impl Timeline {
             // include physical changes from later commits that will be marked
             // as aborted, and will need to be vacuumed away.
             let commit_lsn = Lsn((low - 1) * 8);
+            // This maxing operation is for the edge case that the search above did
+            // set found_smaller to true but it never increased the lsn. Then, low
+            // is still the old min_lsn the subtraction above could possibly give a value
+            // below the anchestor_lsn.
+            let commit_lsn = commit_lsn.max(min_lsn);
             match (found_smaller, found_larger) {
                 (false, false) => {
                     // This can happen if no commit records have been processed yet, e.g.

@@ -817,7 +814,7 @@ impl Timeline {
     /// Get cached size of relation if it not updated after specified LSN
     pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
         let rel_size_cache = self.rel_size_cache.read().unwrap();
-        if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
+        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
             if lsn >= *cached_lsn {
                 return Some(*nblocks);
             }

@@ -828,7 +825,16 @@ impl Timeline {
     /// Update cached relation size if there is no more recent update
     pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        match rel_size_cache.entry(tag) {
+
+        if lsn < rel_size_cache.complete_as_of {
+            // Do not cache old values. It's safe to cache the size on read, as long as
+            // the read was at an LSN since we started the WAL ingestion. Reasoning: we
+            // never evict values from the cache, so if the relation size changed after
+            // 'lsn', the new value is already in the cache.
+            return;
+        }
+
+        match rel_size_cache.map.entry(tag) {
             hash_map::Entry::Occupied(mut entry) => {
                 let cached_lsn = entry.get_mut();
                 if lsn >= cached_lsn.0 {

@@ -844,13 +850,13 @@ impl Timeline {
     /// Store cached relation size
     pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.insert(tag, (lsn, nblocks));
+        rel_size_cache.map.insert(tag, (lsn, nblocks));
     }

     /// Remove cached relation size
     pub fn remove_cached_rel_size(&self, tag: &RelTag) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.remove(tag);
+        rel_size_cache.map.remove(tag);
     }
 }

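The relation-size cache accesses change from `rel_size_cache.get/insert/remove` to going through a `.map` field, and `update_cached_rel_size` now refuses LSNs older than `complete_as_of`. Together that implies the cache grew from a bare map into a struct along these lines (a sketch with placeholder types, not the literal definition from the commit):

```rust
use std::collections::HashMap;

// Placeholder aliases standing in for pageserver's Lsn, RelTag and BlockNumber.
type Lsn = u64;
type RelTag = u32;
type BlockNumber = u32;

// Sketch of the implied structure: a map plus a watermark LSN. Reads at or
// above `complete_as_of` are safe to cache because WAL ingestion since that
// point has kept the map current, and entries are never evicted.
struct RelSizeCache {
    complete_as_of: Lsn,
    map: HashMap<RelTag, (Lsn, BlockNumber)>,
}

impl RelSizeCache {
    fn update(&mut self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
        if lsn < self.complete_as_of {
            // Do not cache values read below the watermark: a newer size may
            // already be in the map and must not be shadowed by a stale one.
            return;
        }
        let entry = self.map.entry(tag).or_insert((lsn, nblocks));
        if lsn >= entry.0 {
            *entry = (lsn, nblocks);
        }
    }
}
```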
@@ -1401,7 +1407,7 @@ impl<'a> DatadirModification<'a> {
         let n_files;
         let mut aux_files = self.tline.aux_files.lock().await;
         if let Some(mut dir) = aux_files.dir.take() {
-            // We already updated aux files in `self`: emit a delta and update our latest value
+            // We already updated aux files in `self`: emit a delta and update our latest value.
             dir.upsert(file_path.clone(), content.clone());
             n_files = dir.files.len();
             if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {

@@ -1446,10 +1452,14 @@ impl<'a> DatadirModification<'a> {
                     // reset the map.
                     return Err(e.into());
                 }
-                // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
-                // we are assuming that all _other_ possible errors represents a missing key. If some
-                // other error occurs, we may incorrectly reset the map of aux files.
-                Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
+                // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
+                // the original code assumes all other errors are missing keys. Therefore, we keep the code path
+                // the same for now, though in theory, we should only match the `MissingKey` variant.
+                Err(
+                    PageReconstructError::Other(_)
+                    | PageReconstructError::WalRedo(_)
+                    | PageReconstructError::MissingKey { .. },
+                ) => {
                     // Key is missing, we must insert an image as the basis for subsequent deltas.

                     let mut dir = AuxFilesDirectory {

@@ -33,6 +33,52 @@ impl Value {
         }
     }
 }
+
+#[cfg(test)]
+#[derive(Debug, PartialEq)]
+pub(crate) enum InvalidInput {
+    TooShortValue,
+    TooShortPostgresRecord,
+}
+
+/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
+/// use this type for querying if a slice looks some particular way.
+#[cfg(test)]
+pub(crate) struct ValueBytes;
+
+#[cfg(test)]
+impl ValueBytes {
+    pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
+        if raw.len() < 12 {
+            return Err(InvalidInput::TooShortValue);
+        }
+
+        let value_discriminator = &raw[0..4];
+
+        if value_discriminator == [0, 0, 0, 0] {
+            // Value::Image always initializes
+            return Ok(true);
+        }
+
+        if value_discriminator != [0, 0, 0, 1] {
+            // not a Value::WalRecord(..)
+            return Ok(false);
+        }
+
+        let walrecord_discriminator = &raw[4..8];
+
+        if walrecord_discriminator != [0, 0, 0, 0] {
+            // only NeonWalRecord::Postgres can have will_init
+            return Ok(false);
+        }
+
+        if raw.len() < 17 {
+            return Err(InvalidInput::TooShortPostgresRecord);
+        }
+
+        Ok(raw[8] == 1)
+    }
+}

 #[cfg(test)]
 mod test {
     use super::*;
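`ValueBytes::will_init` peeks at the serialized bytes instead of deserializing: offsets 0..4 hold the `Value` enum tag, 4..8 the `NeonWalRecord` tag, and offset 8 the `will_init` bool. The tag bytes the tests below assert ([0, 0, 0, 1] for variant 1) indicate a fixed-width big-endian u32 tag. A self-contained sketch of that encoding using bincode 1.x options (the `Demo` enum is illustrative only, not the real `Value` type):

```rust
use bincode::Options;
use serde::Serialize;

#[derive(Serialize)]
enum Demo {
    Image,           // variant 0
    WalRecord(bool), // variant 1
}

fn main() {
    // Fixed-width, big-endian integers reproduce the byte patterns the tests
    // assert: a 4-byte tag, then the payload bytes.
    let opts = bincode::options().with_big_endian().with_fixint_encoding();
    assert_eq!(opts.serialize(&Demo::Image).unwrap(), vec![0, 0, 0, 0]);
    assert_eq!(opts.serialize(&Demo::WalRecord(true)).unwrap(), vec![0, 0, 0, 1, 1]);
}
```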
@@ -70,6 +116,8 @@ mod test {
         ];

         roundtrip!(image, expected);
+
+        assert!(ValueBytes::will_init(&expected).unwrap());
     }

     #[test]

@@ -93,6 +141,96 @@ mod test {
         ];

         roundtrip!(rec, expected);
+
+        assert!(ValueBytes::will_init(&expected).unwrap());
+    }
+
+    #[test]
+    fn bytes_inspection_too_short_image() {
+        let rec = Value::Image(Bytes::from_static(b""));
+
+        #[rustfmt::skip]
+        let expected = [
+            // top level discriminator of 4 bytes
+            0x00, 0x00, 0x00, 0x00,
+            // 8 byte length
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        ];
+
+        roundtrip!(rec, expected);
+
+        assert!(ValueBytes::will_init(&expected).unwrap());
+        assert_eq!(expected.len(), 12);
+        for len in 0..12 {
+            assert_eq!(
+                ValueBytes::will_init(&expected[..len]).unwrap_err(),
+                InvalidInput::TooShortValue
+            );
+        }
+    }
+
+    #[test]
+    fn bytes_inspection_too_short_postgres_record() {
+        let rec = NeonWalRecord::Postgres {
+            will_init: false,
+            rec: Bytes::from_static(b""),
+        };
+        let rec = Value::WalRecord(rec);
+
+        #[rustfmt::skip]
+        let expected = [
+            // flattened discriminator of total 8 bytes
+            0x00, 0x00, 0x00, 0x01,
+            0x00, 0x00, 0x00, 0x00,
+            // will_init
+            0x00,
+            // 8 byte length
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        ];
+
+        roundtrip!(rec, expected);
+
+        assert!(!ValueBytes::will_init(&expected).unwrap());
+        assert_eq!(expected.len(), 17);
+        for len in 12..17 {
+            assert_eq!(
+                ValueBytes::will_init(&expected[..len]).unwrap_err(),
+                InvalidInput::TooShortPostgresRecord
+            )
+        }
+        for len in 0..12 {
+            assert_eq!(
+                ValueBytes::will_init(&expected[..len]).unwrap_err(),
+                InvalidInput::TooShortValue
+            )
+        }
+    }
+
+    #[test]
+    fn clear_visibility_map_flags_example() {
+        let rec = NeonWalRecord::ClearVisibilityMapFlags {
+            new_heap_blkno: Some(0x11),
+            old_heap_blkno: None,
+            flags: 0x03,
+        };
+        let rec = Value::WalRecord(rec);
+
+        #[rustfmt::skip]
+        let expected = [
+            // discriminators
+            0x00, 0x00, 0x00, 0x01,
+            0x00, 0x00, 0x00, 0x01,
+            // Some == 1 followed by 4 bytes
+            0x01, 0x00, 0x00, 0x00, 0x11,
+            // None == 0
+            0x00,
+            // flags
+            0x03
+        ];
+
+        roundtrip!(rec, expected);
+
+        assert!(!ValueBytes::will_init(&expected).unwrap());
     }
 }

@@ -33,13 +33,14 @@
 use std::collections::HashMap;
 use std::fmt;
 use std::future::Future;
+use std::num::NonZeroUsize;
 use std::panic::AssertUnwindSafe;
+use std::str::FromStr;
 use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};

 use futures::FutureExt;
 use pageserver_api::shard::TenantShardId;
-use tokio::runtime::Runtime;
 use tokio::task::JoinHandle;
 use tokio::task_local;
 use tokio_util::sync::CancellationToken;

@@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn};

 use once_cell::sync::Lazy;

+use utils::env;
 use utils::id::TimelineId;

+use crate::metrics::set_tokio_runtime_setup;
+
 //
 // There are four runtimes:
 //

@@ -98,52 +102,119 @@ use utils::id::TimelineId;
 // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
 // happen, but still.
 //
-pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("compute request worker")
-        .enable_all()
-        .build()
-        .expect("Failed to create compute request runtime")
-});
-
-pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("mgmt request worker")
-        .enable_all()
-        .build()
-        .expect("Failed to create mgmt request runtime")
-});
-
-pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("walreceiver worker")
-        .enable_all()
-        .build()
-        .expect("Failed to create walreceiver runtime")
-});
-
-pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("background op worker")
-        // if you change the number of worker threads please change the constant below
-        .enable_all()
-        .build()
-        .expect("Failed to create background op runtime")
-});
-
-pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
-    // force init and thus panics
-    let _ = BACKGROUND_RUNTIME.handle();
+pub(crate) static TOKIO_WORKER_THREADS: Lazy<NonZeroUsize> = Lazy::new(|| {
     // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
     // tokio would had already panicked for parsing errors or NotUnicode
     //
     // this will be wrong if any of the runtimes gets their worker threads configured to something
     // else, but that has not been needed in a long time.
-    std::env::var("TOKIO_WORKER_THREADS")
-        .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
+    NonZeroUsize::new(
+        std::env::var("TOKIO_WORKER_THREADS")
+            .map(|s| s.parse::<usize>().unwrap())
+            .unwrap_or_else(|_e| usize::max(2, num_cpus::get())),
+    )
+    .expect("the max() ensures that this is not zero")
 });
+
+enum TokioRuntimeMode {
+    SingleThreaded,
+    MultiThreaded { num_workers: NonZeroUsize },
+}
+
+impl FromStr for TokioRuntimeMode {
+    type Err = String;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "current_thread" => Ok(TokioRuntimeMode::SingleThreaded),
+            s => match s.strip_prefix("multi_thread:") {
+                Some("default") => Ok(TokioRuntimeMode::MultiThreaded {
+                    num_workers: *TOKIO_WORKER_THREADS,
+                }),
+                Some(suffix) => {
+                    let num_workers = suffix.parse::<NonZeroUsize>().map_err(|e| {
+                        format!(
+                            "invalid number of multi-threaded runtime workers ({suffix:?}): {e}",
+                        )
+                    })?;
+                    Ok(TokioRuntimeMode::MultiThreaded { num_workers })
+                }
+                None => Err(format!("invalid runtime config: {s:?}")),
+            },
+        }
+    }
+}
+
+static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
+    let thread_name = "pageserver-tokio";
+    let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
+        // If the env var is not set, leave this static as None.
+        set_tokio_runtime_setup(
+            "multiple-runtimes",
+            NUM_MULTIPLE_RUNTIMES
+                .checked_mul(*TOKIO_WORKER_THREADS)
+                .unwrap(),
+        );
+        return None;
+    };
+    Some(match mode {
+        TokioRuntimeMode::SingleThreaded => {
+            set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap());
+            tokio::runtime::Builder::new_current_thread()
+                .thread_name(thread_name)
+                .enable_all()
+                .build()
+                .expect("failed to create one single runtime")
+        }
+        TokioRuntimeMode::MultiThreaded { num_workers } => {
+            set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers);
+            tokio::runtime::Builder::new_multi_thread()
+                .thread_name(thread_name)
+                .enable_all()
+                .worker_threads(num_workers.get())
+                .build()
+                .expect("failed to create one multi-threaded runtime")
+        }
+    })
+});
+
+/// Declare a lazy static variable named `$varname` that will resolve
+/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME`
+/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation
+/// declares a separate runtime and the lazy static variable `$varname`
+/// will resolve to that separate runtime.
+///
+/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if
+/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime
+/// otherwise.
+macro_rules! pageserver_runtime {
+    ($varname:ident, $name:literal) => {
+        pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| {
+            if let Some(runtime) = &*ONE_RUNTIME {
+                return runtime;
+            }
+            static RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
+                tokio::runtime::Builder::new_multi_thread()
+                    .thread_name($name)
+                    .worker_threads(TOKIO_WORKER_THREADS.get())
+                    .enable_all()
+                    .build()
+                    .expect(std::concat!("Failed to create runtime ", $name))
+            });
+            &*RUNTIME
+        });
+    };
+}
+
+pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker");
+pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
+pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
+pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
+// Bump this number when adding a new pageserver_runtime!
+// SAFETY: it's obviously correct
+const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };

 #[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);

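With this in place, `NEON_PAGESERVER_USE_ONE_RUNTIME` selects the runtime topology. Per the `FromStr` impl above it accepts `current_thread`, `multi_thread:default`, or `multi_thread:<N>` with `N >= 1`; anything else fails to parse. A sketch of the accepted values, assuming the module-private types were visible to a test:

```rust
// Accepted values, per from_str above:
assert!(matches!(
    "current_thread".parse::<TokioRuntimeMode>(),
    Ok(TokioRuntimeMode::SingleThreaded)
));
assert!(matches!(
    "multi_thread:4".parse::<TokioRuntimeMode>(),
    Ok(TokioRuntimeMode::MultiThreaded { .. })
));
// Rejected: zero workers (NonZeroUsize refuses 0) and unknown shapes.
assert!("multi_thread:0".parse::<TokioRuntimeMode>().is_err());
assert!("4".parse::<TokioRuntimeMode>().is_err());
```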
@@ -214,13 +285,12 @@ pub enum TaskKind {
    /// Internally, `Client` hands over requests to the `Connection` object.
    /// The `Connection` object is responsible for speaking the wire protocol.
    ///
-    /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
-    /// That abstraction doesn't use `task_mgr`.
+    /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
    /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
    /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
    ///
-    /// Once the connection is established, the `TaskHandle` task creates a
-    /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
+    /// Once the connection is established, the `TaskHandle` task spawns a
+    /// [`WalReceiverConnectionPoller`] task that is responsible for polling
    /// the `Connection` object.
    /// A `CancellationToken` created by the `TaskHandle` task ensures
    /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.

@@ -230,7 +300,6 @@ pub enum TaskKind {
    WalReceiverManager,

    /// The `TaskHandle` task that executes `handle_walreceiver_connection`.
-    /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
    /// See the comment on [`WalReceiverManager`].
    ///
    /// [`WalReceiverManager`]: Self::WalReceiverManager

@@ -12,6 +12,7 @@
 //!

 use anyhow::{bail, Context};
+use arc_swap::ArcSwap;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use enumset::EnumSet;

@@ -98,7 +99,7 @@ use std::ops::Bound::Included;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
-use std::sync::{Mutex, RwLock};
+use std::sync::Mutex;
 use std::time::{Duration, Instant};

 use crate::span;

@@ -260,7 +261,7 @@ pub struct Tenant {
    // We keep TenantConfOpt sturct here to preserve the information
    // about parameters that are not set.
    // This is necessary to allow global config updates.
-    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,

    tenant_shard_id: TenantShardId,

@@ -385,7 +386,7 @@ impl WalRedoManager {

    pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
        match self {
-            WalRedoManager::Prod(m) => m.status(),
+            WalRedoManager::Prod(m) => Some(m.status()),
            #[cfg(test)]
            WalRedoManager::Test(_) => None,
        }

@@ -558,9 +559,10 @@ impl Tenant {
                // By doing what we do here, the index part upload is retried.
                // If control plane retries timeline creation in the meantime, the mgmt API handler
                // for timeline creation will coalesce on the upload we queue here.
+                // FIXME: this branch should be dead code as we no longer write local metadata.
                let rtc = timeline.remote_client.as_ref().unwrap();
                rtc.init_upload_queue_for_empty_remote(&metadata)?;
-                rtc.schedule_index_upload_for_metadata_update(&metadata)?;
+                rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
            }

            timeline

@@ -1515,7 +1517,7 @@ impl Tenant {
                // sizes etc. and that would get confused if the previous page versions
                // are not in the repository yet.
                ancestor_timeline
-                    .wait_lsn(*lsn, ctx)
+                    .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
                    .await
                    .map_err(|e| match e {
                        e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {

@@ -1606,7 +1608,7 @@ impl Tenant {
        );

        {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();

            if !conf.location.may_delete_layers_hint() {
                info!("Skipping GC in location state {:?}", conf.location);

@@ -1633,7 +1635,7 @@ impl Tenant {
        }

        {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();
            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
                info!("Skipping compaction in location state {:?}", conf.location);
                return Ok(());

@@ -1782,7 +1784,7 @@ impl Tenant {
    async fn shutdown(
        &self,
        shutdown_progress: completion::Barrier,
-        freeze_and_flush: bool,
+        shutdown_mode: timeline::ShutdownMode,
    ) -> Result<(), completion::Barrier> {
        span::debug_assert_current_span_has_tenant_id();

@@ -1829,16 +1831,8 @@ impl Tenant {
            timelines.values().for_each(|timeline| {
                let timeline = Arc::clone(timeline);
                let timeline_id = timeline.timeline_id;
-                let span =
-                    tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
-                js.spawn(async move {
-                    if freeze_and_flush {
-                        timeline.flush_and_shutdown().instrument(span).await
-                    } else {
-                        timeline.shutdown().instrument(span).await
-                    }
-                });
+                let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
+                js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
            })
        };
        // test_long_timeline_create_then_tenant_delete is leaning on this message
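`Tenant::shutdown` now takes a `timeline::ShutdownMode` instead of a `freeze_and_flush: bool`, and the per-timeline branching moves into `Timeline::shutdown` itself. The diff shows two variants in use, `FreezeAndFlush` (in the tests further down) and `Hard`; a minimal sketch of the enum this implies:

```rust
// Inferred from the call sites in this diff; the real enum may carry more
// variants or data.
#[derive(Debug, Clone, Copy)]
pub(crate) enum ShutdownMode {
    /// Flush in-memory data to disk before shutting down
    /// (the old `freeze_and_flush: true`).
    FreezeAndFlush,
    /// Shut down without flushing (the old `freeze_and_flush: false`).
    Hard,
}
```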
@@ -2082,14 +2076,14 @@ impl Tenant {
    }

    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
-        self.tenant_conf.read().unwrap().location.attach_mode
+        self.tenant_conf.load().location.attach_mode
    }

    /// For API access: generate a LocationConfig equivalent to the one that would be used to
    /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
    /// rare external API calls, like a reconciliation at startup.
    pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
-        let conf = self.tenant_conf.read().unwrap();
+        let conf = self.tenant_conf.load();

        let location_config_mode = match conf.location.attach_mode {
            AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,

@@ -2236,7 +2230,7 @@ where

 impl Tenant {
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf.clone()
+        self.tenant_conf.load().tenant_conf.clone()
    }

    pub fn effective_config(&self) -> TenantConf {

@@ -2245,84 +2239,84 @@ impl Tenant {
    }

    pub fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    pub fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    pub fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    pub fn get_compaction_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_period
            .unwrap_or(self.conf.default_tenant_conf.compaction_period)
    }

    pub fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    pub fn get_gc_horizon(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .gc_horizon
            .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
    }

    pub fn get_gc_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .gc_period
            .unwrap_or(self.conf.default_tenant_conf.gc_period)
    }

    pub fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    pub fn get_pitr_interval(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .pitr_interval
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

    pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .trace_read_requests
            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
    }

    pub fn get_min_resident_size_override(&self) -> Option<u64> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .min_resident_size_override
            .or(self.conf.default_tenant_conf.min_resident_size_override)
    }

    pub fn get_heatmap_period(&self) -> Option<Duration> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        let heatmap_period = tenant_conf
            .heatmap_period
            .unwrap_or(self.conf.default_tenant_conf.heatmap_period);

@@ -2334,26 +2328,40 @@ impl Tenant {
    }

    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
-        self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
-        self.tenant_conf_updated();
+        // Use read-copy-update in order to avoid overwriting the location config
+        // state if this races with [`Tenant::set_new_location_config`]. Note that
+        // this race is not possible if both request types come from the storage
+        // controller (as they should!) because an exclusive op lock is required
+        // on the storage controller side.
+        self.tenant_conf.rcu(|inner| {
+            Arc::new(AttachedTenantConf {
+                tenant_conf: new_tenant_conf.clone(),
+                location: inner.location,
+            })
+        });
+
+        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated();
+            timeline.tenant_conf_updated(&new_tenant_conf);
        }
    }

    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
-        *self.tenant_conf.write().unwrap() = new_conf;
-        self.tenant_conf_updated();
+        let new_tenant_conf = new_conf.tenant_conf.clone();
+
+        self.tenant_conf.store(Arc::new(new_conf));
+
+        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated();
+            timeline.tenant_conf_updated(&new_tenant_conf);
        }
    }

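The `RwLock<AttachedTenantConf>` to `ArcSwap<AttachedTenantConf>` migration makes every getter a lock-free `load()`, while writers either `store` a whole new value or use `rcu` when they must preserve part of the current one. A self-contained sketch of the `rcu` pattern used in `set_new_tenant_config` above (the field names are illustrative):

```rust
use arc_swap::ArcSwap;
use std::sync::Arc;

struct Conf {
    tenant_overrides: u32,
    location: u32,
}

fn main() {
    let conf = ArcSwap::from_pointee(Conf { tenant_overrides: 1, location: 7 });

    // rcu re-runs the closure if another writer swapped the value in the
    // meantime, so `location` can never be clobbered with a stale copy.
    conf.rcu(|inner| {
        Arc::new(Conf {
            tenant_overrides: 2,
            location: inner.location,
        })
    });

    assert_eq!(conf.load().tenant_overrides, 2);
    assert_eq!(conf.load().location, 7);
}
```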
@@ -2367,11 +2375,8 @@ impl Tenant {
            .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
    }

-    pub(crate) fn tenant_conf_updated(&self) {
-        let conf = {
-            let guard = self.tenant_conf.read().unwrap();
-            Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
-        };
+    pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
+        let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
        self.timeline_get_throttle.reconfigure(conf)
    }

@@ -2519,7 +2524,7 @@ impl Tenant {
                Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
                &crate::metrics::tenant_throttling::TIMELINE_GET,
            )),
-            tenant_conf: Arc::new(RwLock::new(attached_conf)),
+            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
        }
    }

@@ -2865,20 +2870,23 @@ impl Tenant {
                }
            }

-            if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
-                let branchpoints: Vec<Lsn> = all_branchpoints
-                    .range((
-                        Included((timeline_id, Lsn(0))),
-                        Included((timeline_id, Lsn(u64::MAX))),
-                    ))
-                    .map(|&x| x.1)
-                    .collect();
-                timeline
-                    .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
-                    .await?;
-
-                gc_timelines.push(timeline);
-            }
+            let cutoff = timeline
+                .get_last_record_lsn()
+                .checked_sub(horizon)
+                .unwrap_or(Lsn(0));
+
+            let branchpoints: Vec<Lsn> = all_branchpoints
+                .range((
+                    Included((timeline_id, Lsn(0))),
+                    Included((timeline_id, Lsn(u64::MAX))),
+                ))
+                .map(|&x| x.1)
+                .collect();
+            timeline
+                .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
+                .await?;
+
+            gc_timelines.push(timeline);
        }
        drop(gc_cs);
        Ok(gc_timelines)

@@ -3023,7 +3031,7 @@ impl Tenant {
        // See also https://github.com/neondatabase/neon/issues/3865
        if let Some(remote_client) = new_timeline.remote_client.as_ref() {
            remote_client
-                .schedule_index_upload_for_metadata_update(&metadata)
+                .schedule_index_upload_for_full_metadata_update(&metadata)
                .context("branch initial metadata upload")?;
        }

@@ -3186,7 +3194,7 @@ impl Tenant {
        run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;

        // Upload the created data dir to S3
-        if self.tenant_shard_id().is_zero() {
+        if self.tenant_shard_id().is_shard_zero() {
            self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
                .await?;
        }

@@ -3433,7 +3441,7 @@ impl Tenant {
            .store(size, Ordering::Relaxed);

        // Only shard zero should be calculating synthetic sizes
-        debug_assert!(self.shard_identity.is_zero());
+        debug_assert!(self.shard_identity.is_shard_zero());

        TENANT_SYNTHETIC_SIZE_METRIC
            .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])

@@ -3505,7 +3513,7 @@ impl Tenant {
    }

    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf.clone()
+        self.tenant_conf.load().tenant_conf.clone()
    }
 }

@@ -3653,6 +3661,9 @@ pub(crate) mod harness {
            heatmap_period: Some(tenant_conf.heatmap_period),
            lazy_slru_download: Some(tenant_conf.lazy_slru_download),
            timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
+            image_layer_creation_check_threshold: Some(
+                tenant_conf.image_layer_creation_check_threshold,
+            ),
        }
    }
 }

@@ -3841,6 +3852,8 @@ pub(crate) mod harness {

 #[cfg(test)]
 mod tests {
+    use std::collections::BTreeMap;
+
    use super::*;
    use crate::keyspace::KeySpaceAccum;
    use crate::repository::{Key, Value};

@@ -3849,8 +3862,10 @@ mod tests {
    use crate::DEFAULT_PG_VERSION;
    use bytes::BytesMut;
    use hex_literal::hex;
+    use pageserver_api::key::NON_INHERITED_RANGE;
    use pageserver_api::keyspace::KeySpace;
    use rand::{thread_rng, Rng};
+    use tests::timeline::{GetVectoredError, ShutdownMode};

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));

@@ -4296,7 +4311,7 @@ mod tests {
        make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
        // so that all uploads finish & we can call harness.load() below again
        tenant
-            .shutdown(Default::default(), true)
+            .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
            .instrument(harness.span())
            .await
            .ok()

@@ -4337,7 +4352,7 @@ mod tests {

        // so that all uploads finish & we can call harness.load() below again
        tenant
-            .shutdown(Default::default(), true)
+            .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
            .instrument(harness.span())
            .await
            .ok()
@@ -4647,6 +4662,62 @@ mod tests {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
|
||||||
|
let harness = TenantHarness::create("test_get_vectored_aux_files")?;
|
||||||
|
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
let tline = tenant
|
||||||
|
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||||
|
.await?;
|
||||||
|
let tline = tline.raw_timeline().unwrap();
|
||||||
|
|
||||||
|
let mut modification = tline.begin_modification(Lsn(0x1000));
|
||||||
|
modification.put_file("foo/bar1", b"content1", &ctx).await?;
|
||||||
|
modification.set_lsn(Lsn(0x1008))?;
|
||||||
|
modification.put_file("foo/bar2", b"content2", &ctx).await?;
|
||||||
|
modification.commit(&ctx).await?;
|
||||||
|
|
||||||
|
let child_timeline_id = TimelineId::generate();
|
||||||
|
tenant
|
||||||
|
.branch_timeline_test(
|
||||||
|
tline,
|
||||||
|
child_timeline_id,
|
||||||
|
Some(tline.get_last_record_lsn()),
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let child_timeline = tenant
|
||||||
|
.get_timeline(child_timeline_id, true)
|
||||||
|
.expect("Should have the branched timeline");
|
||||||
|
|
||||||
|
let aux_keyspace = KeySpace {
|
||||||
|
ranges: vec![NON_INHERITED_RANGE],
|
||||||
|
};
|
||||||
|
let read_lsn = child_timeline.get_last_record_lsn();
|
||||||
|
|
||||||
|
let vectored_res = child_timeline
|
||||||
|
.get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
child_timeline
|
||||||
|
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let images = vectored_res?;
|
||||||
|
let mut key = NON_INHERITED_RANGE.start;
|
||||||
|
while key < NON_INHERITED_RANGE.end {
|
||||||
|
assert!(matches!(
|
||||||
|
images[&key],
|
||||||
|
Err(PageReconstructError::MissingKey(_))
|
||||||
|
));
|
||||||
|
key = key.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
// Test that vectored get handles layer gaps correctly
|
// Test that vectored get handles layer gaps correctly
|
||||||
// by advancing into the next ancestor timeline if required.
|
// by advancing into the next ancestor timeline if required.
|
||||||
//
|
//
|
||||||
@@ -4786,6 +4857,166 @@ mod tests {
         Ok(())
     }
 
+    // Test that vectored get descends into ancestor timelines correctly and
+    // does not return an image that's newer than requested.
+    //
+    // The diagram below illustrates an interesting case. We have a parent timeline
+    // (top of the Lsn range) and a child timeline. The request key cannot be reconstructed
+    // from the child timeline, so the parent timeline must be visited. When advancing into
+    // the child timeline, the read path needs to remember what the requested Lsn was in
+    // order to avoid returning an image that's too new. The test below constructs such
+    // a timeline setup and does a few queries around the Lsn of each page image.
+    // ```
+    //    LSN
+    //     ^
+    //     |
+    //     |
+    // 500 | --------------------------------------> branch point
+    // 400 |        X
+    // 300 |        X
+    // 200 | --------------------------------------> requested lsn
+    // 100 |        X
+    //     |---------------------------------------> Key
+    //              |
+    //              ------> requested key
+    //
+    // Legend:
+    // * X - page images
+    // ```
+    #[tokio::test]
+    async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
+        let (tenant, ctx) = harness.load().await;
+
+        let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let end_key = start_key.add(1000);
+        let child_gap_at_key = start_key.add(500);
+        let mut parent_gap_lsns: BTreeMap<Lsn, String> = BTreeMap::new();
+
+        let mut current_lsn = Lsn(0x10);
+
+        let timeline_id = TimelineId::generate();
+        let parent_timeline = tenant
+            .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        current_lsn += 0x100;
+
+        for _ in 0..3 {
+            let mut key = start_key;
+            while key < end_key {
+                current_lsn += 0x10;
+
+                let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
+
+                let mut writer = parent_timeline.writer().await;
+                writer
+                    .put(
+                        key,
+                        current_lsn,
+                        &Value::Image(test_img(&image_value)),
+                        &ctx,
+                    )
+                    .await?;
+                writer.finish_write(current_lsn);
+
+                if key == child_gap_at_key {
+                    parent_gap_lsns.insert(current_lsn, image_value);
+                }
+
+                key = key.next();
+            }
+
+            parent_timeline.freeze_and_flush().await?;
+        }
+
+        let child_timeline_id = TimelineId::generate();
+
+        let child_timeline = tenant
+            .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx)
+            .await?;
+
+        let mut key = start_key;
+        while key < end_key {
+            if key == child_gap_at_key {
+                key = key.next();
+                continue;
+            }
+
+            current_lsn += 0x10;
+
+            let mut writer = child_timeline.writer().await;
+            writer
+                .put(
+                    key,
+                    current_lsn,
+                    &Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(current_lsn);
+
+            key = key.next();
+        }
+
+        child_timeline.freeze_and_flush().await?;
+
+        let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10];
+        let mut query_lsns = Vec::new();
+        for image_lsn in parent_gap_lsns.keys().rev() {
+            for offset in lsn_offsets {
+                query_lsns.push(Lsn(image_lsn
+                    .0
+                    .checked_add_signed(offset)
+                    .expect("Shouldn't overflow")));
+            }
+        }
+
+        for query_lsn in query_lsns {
+            let results = child_timeline
+                .get_vectored_impl(
+                    KeySpace {
+                        ranges: vec![child_gap_at_key..child_gap_at_key.next()],
+                    },
+                    query_lsn,
+                    &ctx,
+                )
+                .await;
+
+            let expected_item = parent_gap_lsns
+                .iter()
+                .rev()
+                .find(|(lsn, _)| **lsn <= query_lsn);
+
+            info!(
+                "Doing vectored read at LSN {}. Expecting image to be: {:?}",
+                query_lsn, expected_item
+            );
+
+            match expected_item {
+                Some((_, img_value)) => {
+                    let key_results = results.expect("No vectored get error expected");
+                    let key_result = &key_results[&child_gap_at_key];
+                    let returned_img = key_result
+                        .as_ref()
+                        .expect("No page reconstruct error expected");
+
+                    info!(
+                        "Vectored read at LSN {} returned image {}",
+                        query_lsn,
+                        std::str::from_utf8(returned_img)?
+                    );
+                    assert_eq!(*returned_img, test_img(img_value));
+                }
+                None => {
+                    assert!(matches!(results, Err(GetVectoredError::MissingKey(_))));
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_random_updates() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_random_updates")?;
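The invariant this test pins down is small enough to state on its own: when the read path descends into the ancestor, it must serve the newest image that is not newer than the requested LSN, or report a missing key. A minimal, self-contained sketch of that expectation (plain `u64` standing in for `Lsn`, a `BTreeMap` for the test's `parent_gap_lsns`):

```
use std::collections::BTreeMap;

// Pick the newest image at or below the requested LSN. This mirrors the
// test's oracle: `parent_gap_lsns.iter().rev().find(|(lsn, _)| **lsn <= query_lsn)`.
fn expected_image(images: &BTreeMap<u64, String>, request_lsn: u64) -> Option<&String> {
    images.range(..=request_lsn).next_back().map(|(_, img)| img)
}

fn main() {
    let images = BTreeMap::from([
        (100, "img@100".to_string()),
        (300, "img@300".to_string()),
        (400, "img@400".to_string()),
    ]);

    // A request at LSN 200 must not see the newer images at 300 and 400.
    assert_eq!(expected_image(&images, 200).map(String::as_str), Some("img@100"));
    // Below the oldest image there is nothing: the `MissingKey` case above.
    assert_eq!(expected_image(&images, 50), None);
}
```
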
@@ -5118,7 +5349,7 @@ mod tests {
         // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
         let raw_tline = tline.raw_timeline().unwrap();
         raw_tline
-            .shutdown()
+            .shutdown(super::timeline::ShutdownMode::Hard)
             .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
             .await;
         std::mem::forget(tline);
@@ -57,6 +57,9 @@ pub mod defaults {
     // throughputs up to 1GiB/s per timeline.
     pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
     pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
+    // By default ingest enough WAL for two new L0 layers before checking if new
+    // image layers should be created.
+    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
 
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }
@@ -362,6 +365,10 @@ pub struct TenantConf {
     pub lazy_slru_download: bool,
 
     pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
+
+    // How much WAL must be ingested before checking again whether a new image layer is required.
+    // Expressed in multiples of checkpoint distance.
+    pub image_layer_creation_check_threshold: u8,
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -454,6 +461,9 @@ pub struct TenantConfOpt {
 
     #[serde(skip_serializing_if = "Option::is_none")]
     pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_layer_creation_check_threshold: Option<u8>,
 }
 
 impl TenantConfOpt {
@@ -508,6 +518,9 @@ impl TenantConfOpt {
                 .timeline_get_throttle
                 .clone()
                 .unwrap_or(global_conf.timeline_get_throttle),
+            image_layer_creation_check_threshold: self
+                .image_layer_creation_check_threshold
+                .unwrap_or(global_conf.image_layer_creation_check_threshold),
         }
     }
 }
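The merge above is the entire per-tenant override mechanism: every optional field falls back to the global default via `Option::unwrap_or`. A minimal sketch with illustrative names (not the pageserver's real types):

```
// Illustrative names, not the pageserver's real types: each per-tenant
// override is an Option that falls back to the global default.
#[derive(Clone, Copy)]
struct GlobalConf {
    image_layer_creation_check_threshold: u8,
}

#[derive(Default)]
struct TenantOverrides {
    image_layer_creation_check_threshold: Option<u8>,
}

impl TenantOverrides {
    fn effective_check_threshold(&self, global: GlobalConf) -> u8 {
        // Option::unwrap_or is the whole mechanism: the tenant value wins,
        // otherwise the global default applies.
        self.image_layer_creation_check_threshold
            .unwrap_or(global.image_layer_creation_check_threshold)
    }
}

fn main() {
    let global = GlobalConf {
        image_layer_creation_check_threshold: 2,
    };
    assert_eq!(TenantOverrides::default().effective_check_threshold(global), 2);

    let tenant = TenantOverrides {
        image_layer_creation_check_threshold: Some(5),
    };
    assert_eq!(tenant.effective_check_threshold(global), 5);
}
```
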
@@ -548,6 +561,7 @@ impl Default for TenantConf {
             heatmap_period: Duration::ZERO,
             lazy_slru_download: false,
             timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
+            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
         }
     }
 }
@@ -621,6 +635,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
             heatmap_period: value.heatmap_period.map(humantime),
             lazy_slru_download: value.lazy_slru_download,
             timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
+            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
         }
     }
 }
@@ -14,7 +14,10 @@ use crate::{
     config::PageServerConf,
     context::RequestContext,
     task_mgr::{self, TaskKind},
-    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
+    tenant::{
+        mgr::{TenantSlot, TenantsMapRemoveResult},
+        timeline::ShutdownMode,
+    },
 };
 
 use super::{
@@ -433,6 +436,11 @@ impl DeleteTenantFlow {
         .await
     }
 
+    /// Check whether background deletion of this tenant is currently in progress
+    pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
+        tenant.delete_progress.try_lock().is_err()
+    }
+
     async fn prepare(
         tenant: &Arc<Tenant>,
     ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
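The `is_in_progress` helper added above relies on an idiom worth spelling out: the deletion flow holds `delete_progress` for its whole duration, so a failed `try_lock` doubles as an "is it running?" probe. A sketch of the same idea with `std::sync::Mutex` (the tenant code uses a tokio mutex, but the shape is identical):

```
use std::sync::{Arc, Mutex};

fn main() {
    // A long-running operation holds this mutex for its whole duration,
    // so a failed try_lock doubles as an "is it running?" probe.
    let delete_progress = Arc::new(Mutex::new(()));

    assert!(delete_progress.try_lock().is_ok()); // nothing in progress

    let guard = delete_progress.lock().unwrap(); // "deletion" starts
    assert!(delete_progress.try_lock().is_err()); // observed as in progress

    drop(guard); // "deletion" finishes
    assert!(delete_progress.try_lock().is_ok()); // idle again
}
```
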
@@ -463,7 +471,7 @@ impl DeleteTenantFlow {
         // tenant.shutdown
         // Its also bad that we're holding tenants.read here.
         // TODO relax set_stopping to be idempotent?
-        if tenant.shutdown(progress, false).await.is_err() {
+        if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
             return Err(DeleteTenantError::Other(anyhow::anyhow!(
                 "tenant shutdown is already in progress"
             )));
@@ -72,6 +72,10 @@ impl EphemeralFile {
         self.len
     }
 
+    pub(crate) fn id(&self) -> page_cache::FileId {
+        self.page_cache_file_id
+    }
+
     pub(crate) async fn read_blk(
         &self,
         blknum: u32,
@@ -346,35 +346,6 @@
     }
 }
 
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub enum InMemoryLayerHandle {
-    Open {
-        lsn_floor: Lsn,
-        end_lsn: Lsn,
-    },
-    Frozen {
-        idx: usize,
-        lsn_floor: Lsn,
-        end_lsn: Lsn,
-    },
-}
-
-impl InMemoryLayerHandle {
-    pub fn get_lsn_floor(&self) -> Lsn {
-        match self {
-            InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
-            InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
-        }
-    }
-
-    pub fn get_end_lsn(&self) -> Lsn {
-        match self {
-            InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
-            InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
-        }
-    }
-}
-
 impl LayerMap {
     ///
     /// Find the latest layer (by lsn.end) that covers the given
@@ -576,41 +547,18 @@ impl LayerMap {
         self.historic.iter()
     }
 
-    /// Get a handle for the first in memory layer that matches the provided predicate.
-    /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
-    ///
-    /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
-    /// the same exclusive region established by holding the layer manager lock.
-    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
+    /// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
+    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
     where
         Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
     {
         if let Some(open) = &self.open_layer {
             if pred(open) {
-                return Some(InMemoryLayerHandle::Open {
-                    lsn_floor: open.get_lsn_range().start,
-                    end_lsn: open.get_lsn_range().end,
-                });
+                return Some(open.clone());
             }
         }
 
-        let pos = self.frozen_layers.iter().rev().position(pred);
-        pos.map(|rev_idx| {
-            let idx = self.frozen_layers.len() - 1 - rev_idx;
-            InMemoryLayerHandle::Frozen {
-                idx,
-                lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
-                end_lsn: self.frozen_layers[idx].get_lsn_range().end,
-            }
-        })
-    }
-
-    /// Get the layer pointed to by the provided handle.
-    pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
-        match handle {
-            InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
-            InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
-        }
+        self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
     }
 
     ///
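The new `find_in_memory_layer` reduces to: check the open layer first, then scan the frozen layers newest-first, and hand back a ref-counted clone instead of an index-based handle. A sketch under simplified types (`Layer` stands in for `InMemoryLayer`):

```
use std::sync::Arc;

// `Layer` stands in for `InMemoryLayer`; only the LSN floor matters here.
struct Layer {
    start_lsn: u64,
}

fn find_layer(
    open: Option<&Arc<Layer>>,
    frozen: &[Arc<Layer>], // ordered oldest to newest
    mut pred: impl FnMut(&Arc<Layer>) -> bool,
) -> Option<Arc<Layer>> {
    // The open layer is the newest; check it first.
    if let Some(open) = open {
        if pred(open) {
            return Some(Arc::clone(open));
        }
    }
    // rfind scans newest-first, replacing the old reverse-position dance.
    frozen.iter().rfind(|l| pred(l)).cloned()
}

fn main() {
    let frozen = vec![
        Arc::new(Layer { start_lsn: 10 }),
        Arc::new(Layer { start_lsn: 20 }),
    ];
    let hit = find_layer(None, &frozen, |l| l.start_lsn <= 20).unwrap();
    assert_eq!(hit.start_lsn, 20); // the newest match wins
}
```
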
@@ -235,6 +235,12 @@ impl TimelineMetadata {
         let bytes = instance.to_bytes().unwrap();
         Self::from_bytes(&bytes).unwrap()
     }
+
+    pub(crate) fn apply(&mut self, update: &MetadataUpdate) {
+        self.body.disk_consistent_lsn = update.disk_consistent_lsn;
+        self.body.prev_record_lsn = update.prev_record_lsn;
+        self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
+    }
 }
 
 impl<'de> Deserialize<'de> for TimelineMetadata {
@@ -259,6 +265,27 @@ impl Serialize for TimelineMetadata {
     }
 }
 
+/// Parts of the metadata which are regularly modified.
+pub(crate) struct MetadataUpdate {
+    disk_consistent_lsn: Lsn,
+    prev_record_lsn: Option<Lsn>,
+    latest_gc_cutoff_lsn: Lsn,
+}
+
+impl MetadataUpdate {
+    pub(crate) fn new(
+        disk_consistent_lsn: Lsn,
+        prev_record_lsn: Option<Lsn>,
+        latest_gc_cutoff_lsn: Lsn,
+    ) -> Self {
+        Self {
+            disk_consistent_lsn,
+            prev_record_lsn,
+            latest_gc_cutoff_lsn,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
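A sketch of the `MetadataUpdate`/`apply` split introduced here, with `u64` standing in for `Lsn` and a deliberately reduced metadata body: only the regularly-changing fields travel in the update, everything else stays untouched.

```
// `u64` stands in for `Lsn`; the metadata body is deliberately reduced.
struct Metadata {
    disk_consistent_lsn: u64,
    prev_record_lsn: Option<u64>,
    latest_gc_cutoff_lsn: u64,
    pg_version: u32, // one of the many fields an update never touches
}

struct Update {
    disk_consistent_lsn: u64,
    prev_record_lsn: Option<u64>,
    latest_gc_cutoff_lsn: u64,
}

impl Metadata {
    fn apply(&mut self, update: &Update) {
        self.disk_consistent_lsn = update.disk_consistent_lsn;
        self.prev_record_lsn = update.prev_record_lsn;
        self.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
        // pg_version and friends deliberately stay as they are.
    }
}

fn main() {
    let mut meta = Metadata {
        disk_consistent_lsn: 0x10,
        prev_record_lsn: None,
        latest_gc_cutoff_lsn: 0x10,
        pg_version: 15,
    };
    meta.apply(&Update {
        disk_consistent_lsn: 0x40,
        prev_record_lsn: Some(0x38),
        latest_gc_cutoff_lsn: 0x20,
    });
    assert_eq!(meta.disk_consistent_lsn, 0x40);
    assert_eq!(meta.prev_record_lsn, Some(0x38));
    assert_eq!(meta.pg_version, 15); // unrelated state survives the update
}
```
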
@@ -44,6 +44,7 @@ use crate::tenant::config::{
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
+use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
 
@@ -677,12 +678,19 @@ pub async fn init_tenant_mgr(
                     }
                 }
             }
-            LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
-                tenant_shard_id,
-                shard_identity,
-                location_conf.tenant_conf,
-                &secondary_conf,
-            )),
+            LocationMode::Secondary(secondary_conf) => {
+                info!(
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard_id = %tenant_shard_id.shard_slug(),
+                    "Starting secondary tenant"
+                );
+                TenantSlot::Secondary(SecondaryTenant::new(
+                    tenant_shard_id,
+                    shard_identity,
+                    location_conf.tenant_conf,
+                    &secondary_conf,
+                ))
+            }
         };
 
         tenants.insert(tenant_shard_id, slot);
@@ -783,11 +791,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                     shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
                     join_set.spawn(
                         async move {
-                            let freeze_and_flush = true;
-
                             let res = {
                                 let (_guard, shutdown_progress) = completion::channel();
-                                t.shutdown(shutdown_progress, freeze_and_flush).await
+                                t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
                             };
 
                             if let Err(other_progress) = res {
@@ -1107,7 +1113,7 @@ impl TenantManager {
         };
 
         info!("Shutting down attached tenant");
-        match tenant.shutdown(progress, false).await {
+        match tenant.shutdown(progress, ShutdownMode::Hard).await {
             Ok(()) => {}
             Err(barrier) => {
                 info!("Shutdown already in progress, waiting for it to complete");
@@ -1223,7 +1229,7 @@ impl TenantManager {
             TenantSlot::Attached(tenant) => {
                 let (_guard, progress) = utils::completion::channel();
                 info!("Shutting down just-spawned tenant, because tenant manager is shut down");
-                match tenant.shutdown(progress, false).await {
+                match tenant.shutdown(progress, ShutdownMode::Hard).await {
                     Ok(()) => {
                         info!("Finished shutting down just-spawned tenant");
                     }
@@ -1273,7 +1279,7 @@ impl TenantManager {
         };
 
         let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, false).await {
+        match tenant.shutdown(progress, ShutdownMode::Hard).await {
             Ok(()) => {
                 slot_guard.drop_old_value()?;
             }
@@ -1411,9 +1417,15 @@ impl TenantManager {
 
         match tenant.current_state() {
             TenantState::Broken { .. } | TenantState::Stopping { .. } => {
-                // If a tenant is broken or stopping, DeleteTenantFlow can
-                // handle it: broken tenants proceed to delete, stopping tenants
-                // are checked for deletion already in progress.
+                // If deletion is already in progress, return success (the semantics of this
+                // function are to return success after deletion is spawned in background).
+                // Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
+                if DeleteTenantFlow::is_in_progress(&tenant) {
+                    // The `delete_progress` lock is held: deletion is already happening
+                    // in the background
+                    slot_guard.revert();
+                    return Ok(());
+                }
             }
             _ => {
                 tenant
@@ -1649,7 +1661,14 @@ impl TenantManager {
             fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
                 "failpoint"
             )));
-            if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
+            if let Err(e) = timeline
+                .wait_lsn(
+                    *target_lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::Tenant,
+                    ctx,
+                )
+                .await
+            {
                 // Failure here might mean shutdown, in any case this part is an optimization
                 // and we shouldn't hold up the split operation.
                 tracing::warn!(
@@ -1670,7 +1689,7 @@ impl TenantManager {
 
         // Phase 5: Shut down the parent shard, and erase it from disk
         let (_guard, progress) = completion::channel();
-        match parent.shutdown(progress, false).await {
+        match parent.shutdown(progress, ShutdownMode::Hard).await {
             Ok(()) => {}
             Err(other) => {
                 other.wait().await;
@@ -2657,11 +2676,11 @@
     let attached_tenant = match slot_guard.get_old_value() {
         Some(TenantSlot::Attached(tenant)) => {
             // whenever we remove a tenant from memory, we don't want to flush and wait for upload
-            let freeze_and_flush = false;
+            let shutdown_mode = ShutdownMode::Hard;
 
             // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
             // that we can continue safely to cleanup.
-            match tenant.shutdown(progress, freeze_and_flush).await {
+            match tenant.shutdown(progress, shutdown_mode).await {
                 Ok(()) => {}
                 Err(_other) => {
                     // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
@@ -200,14 +200,17 @@ use utils::backoff::{
 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
+use std::time::Duration;
 
-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
+use remote_storage::{
+    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel,
+};
 use std::ops::DerefMut;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
 
-use crate::deletion_queue::DeletionQueueClient;
+use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::metrics::{
     MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
     RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -235,6 +238,7 @@ use utils::id::{TenantId, TimelineId};
 
 use self::index::IndexPart;
 
+use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;
@@ -261,6 +265,10 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 
+/// Non-essential flushes of the deletion queue are subject to this timeout, after
+/// which we warn and skip.
+const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
+
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
     Deleted(IndexPart),
@@ -531,9 +539,10 @@ impl RemoteTimelineClient {
     // Upload operations.
     //
 
-    ///
     /// Launch an index-file upload operation in the background, with
-    /// updated metadata.
+    /// fully updated metadata.
+    ///
+    /// This should only be used to upload initial metadata to remote storage.
     ///
     /// The upload will be added to the queue immediately, but it
     /// won't be performed until all previously scheduled layer file
@@ -545,7 +554,7 @@ impl RemoteTimelineClient {
     /// If there were any changes to the list of files, i.e. if any
     /// layer file uploads were scheduled, since the last index file
     /// upload, those will be included too.
-    pub fn schedule_index_upload_for_metadata_update(
+    pub fn schedule_index_upload_for_full_metadata_update(
         self: &Arc<Self>,
         metadata: &TimelineMetadata,
     ) -> anyhow::Result<()> {
@@ -561,6 +570,27 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Launch an index-file upload operation in the background, with only parts of the metadata
+    /// updated.
+    ///
+    /// This is the regular way of updating metadata on layer flushes or Gc.
+    ///
+    /// Using this lighter update mechanism allows for reparenting and detaching without changes to
+    /// `index_part.json`, while being more clear on what values update regularly.
+    pub(crate) fn schedule_index_upload_for_metadata_update(
+        self: &Arc<Self>,
+        update: &MetadataUpdate,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        upload_queue.latest_metadata.apply(update);
+
+        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
+
+        Ok(())
+    }
+
     ///
     /// Launch an index-file upload operation in the background, if necessary.
     ///
@@ -588,14 +618,14 @@ impl RemoteTimelineClient {
         upload_queue: &mut UploadQueueInitialized,
         metadata: TimelineMetadata,
     ) {
+        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
+
         info!(
-            "scheduling metadata upload with {} files ({} changed)",
+            "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
             upload_queue.latest_files.len(),
             upload_queue.latest_files_changes_since_metadata_upload_scheduled,
         );
 
-        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
-
         let index_part = IndexPart::new(
             upload_queue.latest_files.clone(),
             disk_consistent_lsn,
@@ -1050,6 +1080,26 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
+        match tokio::time::timeout(
+            DELETION_QUEUE_FLUSH_TIMEOUT,
+            self.deletion_queue_client.flush_immediate(),
+        )
+        .await
+        {
+            Ok(result) => result,
+            Err(_timeout) => {
+                // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and
+                // to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion
+                // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here.
+                tracing::warn!(
+                    "Timed out waiting for deletion queue flush, acking deletion anyway"
+                );
+                Ok(())
+            }
+        }
+    }
+
     /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
     /// The function deletes layer files one by one, then lists the prefix to see if we leaked something
     /// deletes leaked files if any and proceeds with deletion of index file at the end.
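`flush_deletion_queue` is an instance of a general best-effort pattern: bound a non-essential wait with a timeout and degrade to a warning instead of an error. A runnable sketch in plain tokio (the flush body and the budget below are stand-ins):

```
use std::time::Duration;

// Bound a non-essential wait and degrade to a warning on timeout.
async fn flush_best_effort() -> Result<(), std::io::Error> {
    let flush = async {
        // stand-in for deletion_queue_client.flush_immediate()
        tokio::time::sleep(Duration::from_millis(10)).await;
        Ok(())
    };
    match tokio::time::timeout(Duration::from_secs(10), flush).await {
        Ok(result) => result,
        Err(_elapsed) => {
            eprintln!("timed out waiting for flush, continuing anyway");
            Ok(()) // swallow the timeout: the flush was only an optimization
        }
    }
}

#[tokio::main]
async fn main() {
    flush_best_effort().await.unwrap();
}
```
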
@@ -1097,23 +1147,29 @@ impl RemoteTimelineClient {
         // and retry will arrive to different pageserver there wont be any traces of it on remote storage
         let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
 
-        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
+        // Execute all pending deletions, so that when we proceed to do a listing below, we aren't
         // taking the burden of listing all the layers that we already know we should delete.
-        self.deletion_queue_client.flush_immediate().await?;
+        self.flush_deletion_queue().await?;
 
         let cancel = shutdown_token();
 
         let remaining = download_retry(
             || async {
                 self.storage_impl
-                    .list_files(Some(&timeline_storage_path), None, &cancel)
+                    .list(
+                        Some(&timeline_storage_path),
+                        ListingMode::NoDelimiter,
+                        None,
+                        &cancel,
+                    )
                     .await
             },
             "list remaining files",
             &cancel,
         )
         .await
-        .context("list files remaining files")?;
+        .context("list files remaining files")?
+        .keys;
 
         // We will delete the current index_part object last, since it acts as a deletion
         // marker via its deleted_at attribute
@@ -1173,7 +1229,7 @@ impl RemoteTimelineClient {
 
         // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
         // for a flush to a persistent deletion list so that we may be sure deletion will occur.
-        self.deletion_queue_client.flush_immediate().await?;
+        self.flush_deletion_queue().await?;
 
         fail::fail_point!("timeline-delete-after-index-delete", |_| {
             Err(anyhow::anyhow!(
@@ -1569,7 +1625,7 @@ impl RemoteTimelineClient {
     /// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
     ///
     /// In-progress operations will still be running after this function returns.
-    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
+    /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
     /// to wait for them to complete, after calling this function.
     pub(crate) fn stop(&self) {
         // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
@@ -1999,7 +2055,7 @@ mod tests {
         // Schedule upload of index. Check that it is queued
         let metadata = dummy_metadata(Lsn(0x20));
         client
-            .schedule_index_upload_for_metadata_update(&metadata)
+            .schedule_index_upload_for_full_metadata_update(&metadata)
             .unwrap();
         {
             let mut guard = client.upload_queue.lock().unwrap();
@@ -258,7 +258,7 @@ pub async fn list_remote_timelines(
     tenant_shard_id: TenantShardId,
     cancel: CancellationToken,
 ) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
-    let remote_path = remote_timelines_path(&tenant_shard_id);
+    let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();
 
     fail::fail_point!("storage-sync-list-remote-timelines", |_| {
         anyhow::bail!("storage-sync-list-remote-timelines");
@@ -417,11 +417,16 @@ pub(super) async fn download_index_part(
     let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
 
     let indices = download_retry(
-        || async { storage.list_files(Some(&index_prefix), None, cancel).await },
+        || async {
+            storage
+                .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
+                .await
+        },
         "list index_part files",
         cancel,
     )
-    .await?;
+    .await?
+    .keys;
 
     // General case logic for which index to use: the latest index whose generation
     // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
@@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant(
     let warn_after = 3;
     let max_attempts = 10;
     let mut prefixes = Vec::with_capacity(2);
-    if tenant_shard_id.is_zero() {
+    if tenant_shard_id.is_shard_zero() {
         // Also recover the unsharded prefix for a shard of zero:
         // - if the tenant is totally unsharded, the unsharded prefix contains all the data
         // - if the tenant is sharded, we still want to recover the initdb data, but we only
@@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
 use utils::{
     backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
-    id::TimelineId,
+    id::TimelineId, serde_system_time,
 };
 
 use super::{
@@ -312,7 +312,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
             (detail.last_download, detail.next_download.unwrap())
         };
 
-        if now < next_download {
+        if now > next_download {
             Some(PendingDownload {
                 secondary_state: secondary_tenant,
                 last_download,
@@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> {
         let mut progress = SecondaryProgress {
             layers_total: heatmap_stats.layers,
             bytes_total: heatmap_stats.bytes,
-            heatmap_mtime: Some(heatmap_mtime),
+            heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)),
             layers_downloaded: 0,
             bytes_downloaded: 0,
         };
@@ -647,6 +647,12 @@ impl<'a> TenantDownloader<'a> {
                 progress.bytes_downloaded += layer_byte_count;
                 progress.layers_downloaded += layer_count;
             }
+
+            for delete_timeline in &delete_timelines {
+                // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
+                // from disk fails that will be a fatal error.
+                detail.timelines.remove(delete_timeline);
+            }
         }
 
         // Execute accumulated deletions
@@ -710,13 +716,14 @@ impl<'a> TenantDownloader<'a> {
             .await
             .map_err(UpdateError::from)?;
 
+        SECONDARY_MODE.download_heatmap.inc();
+
         if Some(&download.etag) == prev_etag {
             Ok(HeatMapDownload::Unmodified)
         } else {
             let mut heatmap_bytes = Vec::new();
             let mut body = tokio_util::io::StreamReader::new(download.download_stream);
             let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
-            SECONDARY_MODE.download_heatmap.inc();
             Ok(HeatMapDownload::Modified(HeatMapModified {
                 etag: download.etag,
                 last_modified: download.last_modified,
@@ -786,6 +793,35 @@ impl<'a> TenantDownloader<'a> {
             // Existing on-disk layers: just update their access time.
             if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
                 tracing::debug!("Layer {} is already on disk", layer.name);
+
+                if cfg!(debug_assertions) {
+                    // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
+                    // are already present on disk are really there.
+                    let local_path = self
+                        .conf
+                        .timeline_path(tenant_shard_id, &timeline.timeline_id)
+                        .join(layer.name.file_name());
+                    match tokio::fs::metadata(&local_path).await {
+                        Ok(meta) => {
+                            tracing::debug!(
+                                "Layer {} present at {}, size {}",
+                                layer.name,
+                                local_path,
+                                meta.len(),
+                            );
+                        }
+                        Err(e) => {
+                            tracing::warn!(
+                                "Layer {} not found at {} ({})",
+                                layer.name,
+                                local_path,
+                                e
+                            );
+                            debug_assert!(false);
+                        }
+                    }
+                }
+
                 if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
                     || on_disk.access_time != layer.access_time
                 {
@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
-use std::sync::Mutex;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 
-use super::layer_map::InMemoryLayerHandle;
-use super::timeline::layer_manager::LayerManager;
+use self::inmemory_layer::InMemoryLayerFileId;
 use super::timeline::GetVectoredError;
 use super::PageReconstructError;
 
@@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState {
     pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
 
     keys_done: KeySpaceRandomAccum,
+    layers_visited: u32,
 }
 
 impl ValuesReconstructState {
@@ -125,6 +126,7 @@ impl ValuesReconstructState {
         Self {
             keys: HashMap::new(),
             keys_done: KeySpaceRandomAccum::new(),
+            layers_visited: 0,
         }
     }
 
@@ -138,6 +140,14 @@ impl ValuesReconstructState {
         }
     }
 
+    pub(crate) fn on_layer_visited(&mut self) {
+        self.layers_visited += 1;
+    }
+
+    pub(crate) fn get_layers_visited(&self) -> u32 {
+        self.layers_visited
+    }
+
     /// Update the state collected for a given key.
     /// Returns true if this was the last value needed for the key and false otherwise.
     ///
@@ -204,23 +214,30 @@ impl Default for ValuesReconstructState {
     }
 }
 
-/// Description of layer to be read - the layer map can turn
-/// this description into the actual layer.
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub(crate) enum ReadableLayerDesc {
-    Persistent {
-        desc: PersistentLayerDesc,
-        lsn_range: Range<Lsn>,
-    },
-    InMemory {
-        handle: InMemoryLayerHandle,
-        lsn_ceil: Lsn,
-    },
+/// A key that uniquely identifies a layer in a timeline
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub(crate) enum LayerId {
+    PersitentLayerId(PersistentLayerKey),
+    InMemoryLayerId(InMemoryLayerFileId),
 }
 
-/// Wraper for 'ReadableLayerDesc' sorted by Lsn
+/// Layer wrapper for the read path. Note that it is valid
+/// to use these layers even after external operations have
+/// been performed on them (compaction, freeze, etc.).
 #[derive(Debug)]
-struct ReadableLayerDescOrdered(ReadableLayerDesc);
+pub(crate) enum ReadableLayer {
+    PersistentLayer(Layer),
+    InMemoryLayer(Arc<InMemoryLayer>),
+}
+
+/// A partial description of a read to be done.
+#[derive(Debug, Clone)]
+struct ReadDesc {
+    /// An id used to resolve the readable layer within the fringe
+    layer_id: LayerId,
+    /// Lsn range for the read, used for selecting the next read
+    lsn_range: Range<Lsn>,
+}
 
 /// Data structure which maintains a fringe of layers for the
 /// read path. The fringe is the set of layers which intersects
@@ -231,41 +248,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc);
 /// a two layer indexing scheme.
 #[derive(Debug)]
 pub(crate) struct LayerFringe {
-    layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
-    layers: HashMap<ReadableLayerDesc, KeySpace>,
+    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
+    layers: HashMap<LayerId, LayerKeyspace>,
+}
+
+#[derive(Debug)]
+struct LayerKeyspace {
+    layer: ReadableLayer,
+    target_keyspace: KeySpace,
 }
 
 impl LayerFringe {
     pub(crate) fn new() -> Self {
         LayerFringe {
-            layers_by_lsn: BinaryHeap::new(),
+            planned_reads_by_lsn: BinaryHeap::new(),
             layers: HashMap::new(),
         }
     }
 
-    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
-        let handle = match self.layers_by_lsn.pop() {
-            Some(h) => h,
+    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
+        let read_desc = match self.planned_reads_by_lsn.pop() {
+            Some(desc) => desc,
             None => return None,
         };
 
-        let removed = self.layers.remove_entry(&handle.0);
+        let removed = self.layers.remove_entry(&read_desc.layer_id);
         match removed {
-            Some((layer, keyspace)) => Some((layer, keyspace)),
+            Some((
+                _,
+                LayerKeyspace {
+                    layer,
+                    target_keyspace,
+                },
+            )) => Some((layer, target_keyspace, read_desc.lsn_range)),
             None => unreachable!("fringe internals are always consistent"),
         }
     }
 
-    pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
-        let entry = self.layers.entry(layer.clone());
+    pub(crate) fn update(
+        &mut self,
+        layer: ReadableLayer,
+        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
+    ) {
+        let layer_id = layer.id();
+        let entry = self.layers.entry(layer_id.clone());
         match entry {
             Entry::Occupied(mut entry) => {
-                entry.get_mut().merge(&keyspace);
+                entry.get_mut().target_keyspace.merge(&keyspace);
             }
             Entry::Vacant(entry) => {
-                self.layers_by_lsn
-                    .push(ReadableLayerDescOrdered(entry.key().clone()));
-                entry.insert(keyspace);
+                self.planned_reads_by_lsn.push(ReadDesc {
+                    lsn_range,
+                    layer_id: layer_id.clone(),
+                });
+                entry.insert(LayerKeyspace {
+                    layer,
+                    target_keyspace: keyspace,
+                });
             }
         }
     }
 }
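The fringe's two-level indexing can be shown in miniature: a `BinaryHeap` fixes the order in which reads are popped, while a `HashMap` merges keyspaces per layer, so updating the same layer twice merges rather than re-queues. A sketch with string ids and `Vec<u32>` keyspaces standing in for `LayerId` and `KeySpace`:

```
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};

// A planned read: ordered by end LSN so the heap pops newest-first.
#[derive(PartialEq, Eq)]
struct ReadDesc {
    end_lsn: u64,
    layer_id: String,
}

impl Ord for ReadDesc {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.end_lsn
            .cmp(&other.end_lsn)
            .then_with(|| self.layer_id.cmp(&other.layer_id))
    }
}

impl PartialOrd for ReadDesc {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

#[derive(Default)]
struct Fringe {
    by_lsn: BinaryHeap<ReadDesc>,
    keyspaces: HashMap<String, Vec<u32>>,
}

impl Fringe {
    fn update(&mut self, layer_id: &str, end_lsn: u64, mut keys: Vec<u32>) {
        match self.keyspaces.entry(layer_id.to_string()) {
            // Already planned: merge the keyspace, do not re-queue.
            Entry::Occupied(mut e) => e.get_mut().append(&mut keys),
            // New layer: queue a read descriptor and record its keyspace.
            Entry::Vacant(e) => {
                self.by_lsn.push(ReadDesc {
                    end_lsn,
                    layer_id: layer_id.to_string(),
                });
                e.insert(keys);
            }
        }
    }

    /// Pop the read with the highest end LSN, with its merged keyspace.
    fn next(&mut self) -> Option<(String, Vec<u32>)> {
        let desc = self.by_lsn.pop()?;
        let keys = self.keyspaces.remove(&desc.layer_id)?;
        Some((desc.layer_id, keys))
    }
}

fn main() {
    let mut fringe = Fringe::default();
    fringe.update("delta-A", 100, vec![1, 2]);
    fringe.update("delta-A", 100, vec![3]); // merged, not re-queued
    fringe.update("image-B", 200, vec![1]);

    assert_eq!(fringe.next(), Some(("image-B".to_string(), vec![1])));
    assert_eq!(fringe.next(), Some(("delta-A".to_string(), vec![1, 2, 3])));
    assert_eq!(fringe.next(), None);
}
```
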
@@ -277,77 +317,55 @@ impl Default for LayerFringe {
     }
 }
 
-impl Ord for ReadableLayerDescOrdered {
+impl Ord for ReadDesc {
     fn cmp(&self, other: &Self) -> Ordering {
-        let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
+        let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
         if ord == std::cmp::Ordering::Equal {
-            self.0
-                .get_lsn_floor()
-                .cmp(&other.0.get_lsn_floor())
-                .reverse()
+            self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
         } else {
             ord
         }
     }
 }
 
-impl PartialOrd for ReadableLayerDescOrdered {
+impl PartialOrd for ReadDesc {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl PartialEq for ReadableLayerDescOrdered {
+impl PartialEq for ReadDesc {
     fn eq(&self, other: &Self) -> bool {
-        self.0.get_lsn_floor() == other.0.get_lsn_floor()
-            && self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
+        self.lsn_range == other.lsn_range
     }
 }
 
-impl Eq for ReadableLayerDescOrdered {}
+impl Eq for ReadDesc {}
 
-impl ReadableLayerDesc {
-    pub(crate) fn get_lsn_floor(&self) -> Lsn {
+impl ReadableLayer {
+    pub(crate) fn id(&self) -> LayerId {
         match self {
-            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
-            ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
-        }
-    }
-
-    pub(crate) fn get_lsn_ceil(&self) -> Lsn {
-        match self {
-            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
-            ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
+            Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
+            Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
         }
     }
 
     pub(crate) async fn get_values_reconstruct_data(
         &self,
-        layer_manager: &LayerManager,
         keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
         match self {
-            ReadableLayerDesc::Persistent { desc, lsn_range } => {
-                let layer = layer_manager.get_from_desc(desc);
+            ReadableLayer::PersistentLayer(layer) => {
                 layer
-                    .get_values_reconstruct_data(
-                        keyspace,
-                        lsn_range.clone(),
-                        reconstruct_state,
-                        ctx,
-                    )
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
                    .await
             }
-            ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
-                let layer = layer_manager
-                    .layer_map()
-                    .get_in_memory_layer(handle)
-                    .unwrap();
-
+            ReadableLayer::InMemoryLayer(layer) => {
                 layer
-                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
                     .await
             }
         }
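The `Ord` implementation above determines pop order for the whole vectored read: highest `lsn_range.end` first, and on a tie the read with the lower floor wins because the start comparison is reversed. A small demonstration with `u64` ranges:

```
use std::collections::BinaryHeap;
use std::ops::Range;

#[derive(Debug, PartialEq, Eq)]
struct Desc(Range<u64>);

impl Ord for Desc {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let ord = self.0.end.cmp(&other.0.end);
        if ord == std::cmp::Ordering::Equal {
            // Reversed: of two reads ending at the same LSN, the one that
            // reaches further down (lower floor) compares as greater.
            self.0.start.cmp(&other.0.start).reverse()
        } else {
            ord
        }
    }
}

impl PartialOrd for Desc {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(Desc(40..100));
    heap.push(Desc(10..100)); // same end, lower floor
    heap.push(Desc(0..50));

    assert_eq!(heap.pop(), Some(Desc(10..100))); // lower floor wins the tie
    assert_eq!(heap.pop(), Some(Desc(40..100)));
    assert_eq!(heap.pop(), Some(Desc(0..50)));
}
```
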
@@ -20,8 +20,8 @@
 //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
 //! ```
 //!
-//! Every delta file consists of three parts: "summary", "index", and
-//! "values". The summary is a fixed size header at the beginning of the file,
+//! Every delta file consists of three parts: "summary", "values", and
+//! "index". The summary is a fixed size header at the beginning of the file,
 //! and it contains basic information about the layer, and offsets to the other
 //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the
 //! "values" part. The actual page images and WAL records are stored in the
@@ -47,6 +47,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
+use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -727,6 +728,9 @@ impl DeltaLayerInner {
             // production code path
             expected_summary.index_start_blk = actual_summary.index_start_blk;
             expected_summary.index_root_blk = actual_summary.index_root_blk;
+            // mask out the timeline_id, but still require the layers to be from the same tenant
+            expected_summary.timeline_id = actual_summary.timeline_id;
+
             if actual_summary != expected_summary {
                 bail!(
                     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
@@ -862,7 +866,7 @@ impl DeltaLayerInner {
                 .into(),
         );
 
-        let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64;
+        let data_end_offset = self.index_start_offset();
 
         let reads = Self::plan_reads(
             keyspace,
@@ -938,7 +942,7 @@ impl DeltaLayerInner {
             }
 
             if !range_end_handled {
-                tracing::info!("Handling range end fallback at {}", data_end_offset);
+                tracing::debug!("Handling range end fallback at {}", data_end_offset);
                 planner.handle_range_end(data_end_offset);
             }
         }
@@ -946,6 +950,34 @@ impl DeltaLayerInner {
|
|||||||
Ok(planner.finish())
|
Ok(planner.finish())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_min_read_buffer_size(
|
||||||
|
planned_reads: &[VectoredRead],
|
||||||
|
read_size_soft_max: usize,
|
||||||
|
) -> usize {
|
||||||
|
let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
|
||||||
|
return read_size_soft_max;
|
||||||
|
};
|
||||||
|
|
||||||
|
let largest_read_size = largest_read.size();
|
||||||
|
if largest_read_size > read_size_soft_max {
|
||||||
|
// If the read is oversized, it should only contain one key.
|
||||||
|
let offenders = largest_read
|
||||||
|
.blobs_at
|
||||||
|
.as_slice()
|
||||||
|
.iter()
|
||||||
|
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
|
||||||
|
.join(", ");
|
||||||
|
tracing::warn!(
|
||||||
|
"Oversized vectored read ({} > {}) for keys {}",
|
||||||
|
largest_read_size,
|
||||||
|
read_size_soft_max,
|
||||||
|
offenders
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
largest_read_size
|
||||||
|
}
|
||||||
|
|
||||||
async fn do_reads_and_update_state(
|
async fn do_reads_and_update_state(
|
||||||
&self,
|
&self,
|
||||||
reads: Vec<VectoredRead>,
|
reads: Vec<VectoredRead>,
|
||||||
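
The new `get_min_read_buffer_size` sizes the read buffer to the largest planned read and only warns when that exceeds the soft cap, which a single oversized value is allowed to do. A standalone rendering of the same policy on plain byte counts (hypothetical helper, not the pageserver API):

```rust
// Same sizing policy as the diff above: allocate exactly what the largest
// planned read needs, warn when it blows past the soft cap.
fn min_read_buffer_size(read_sizes: &[usize], soft_max: usize) -> usize {
    let Some(&largest) = read_sizes.iter().max() else {
        return soft_max; // no reads planned; fall back to the configured cap
    };
    if largest > soft_max {
        eprintln!("oversized read: {largest} > {soft_max}");
    }
    largest
}

fn main() {
    // reads within the cap: buffer sized to the largest read
    assert_eq!(min_read_buffer_size(&[100, 1024], 64 * 1024), 1024);
    // one oversized value forces a bigger (warned-about) buffer
    assert_eq!(min_read_buffer_size(&[100, 128 * 1024], 64 * 1024), 128 * 1024);
}
```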
@@ -959,7 +991,8 @@ impl DeltaLayerInner {
             .expect("Layer is loaded with max vectored bytes config")
             .0
             .into();
-        let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
+        let mut buf = Some(BytesMut::with_capacity(buf_size));
 
         // Note that reads are processed in reverse order (from highest key+lsn).
         // This is the order that `ReconstructState` requires such that it can
@@ -986,7 +1019,7 @@ impl DeltaLayerInner {
 
                     // We have "lost" the buffer since the lower level IO api
                     // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+                    buf = Some(BytesMut::with_capacity(buf_size));
 
                     continue;
                 }
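
Both call sites now use the precomputed `buf_size`, and the `Option<BytesMut>` dance stays the same: the I/O call consumes the buffer and only hands it back on success, so the error path reallocates. A minimal sketch of that ownership pattern, assuming only the `bytes` crate and a stand-in `fake_read`:

```rust
use bytes::BytesMut;

// Sketch of the buffer-recycling pattern above: take the buffer out of the
// Option for each read, put it back on success, reallocate on failure.
fn fake_read(buf: BytesMut, fail: bool) -> Result<BytesMut, ()> {
    if fail { Err(()) } else { Ok(buf) }
}

fn main() {
    let buf_size = 8192;
    let mut buf = Some(BytesMut::with_capacity(buf_size));
    for fail in [false, true, false] {
        match fake_read(buf.take().expect("buffer is always restored"), fail) {
            Ok(returned) => buf = Some(returned), // reuse the same allocation
            Err(()) => buf = Some(BytesMut::with_capacity(buf_size)), // buffer lost
        }
    }
}
```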
@@ -1073,11 +1106,195 @@ impl DeltaLayerInner {
         if let Some(last) = all_keys.last_mut() {
             // Last key occupies all space till end of value storage,
             // which corresponds to beginning of the index
-            last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
+            last.size = self.index_start_offset() - last.size;
         }
         Ok(all_keys)
     }
 
+    /// Using the given writer, write out a truncated version, where LSNs higher than the
+    /// truncate_at are missing.
+    #[cfg(test)]
+    pub(super) async fn copy_prefix(
+        &self,
+        writer: &mut DeltaLayerWriter,
+        truncate_at: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        use crate::tenant::vectored_blob_io::{
+            BlobMeta, VectoredReadBuilder, VectoredReadExtended,
+        };
+        use futures::stream::TryStreamExt;
+
+        #[derive(Debug)]
+        enum Item {
+            Actual(Key, Lsn, BlobRef),
+            Sentinel,
+        }
+
+        impl From<Item> for Option<(Key, Lsn, BlobRef)> {
+            fn from(value: Item) -> Self {
+                match value {
+                    Item::Actual(key, lsn, blob) => Some((key, lsn, blob)),
+                    Item::Sentinel => None,
+                }
+            }
+        }
+
+        impl Item {
+            fn offset(&self) -> Option<BlobRef> {
+                match self {
+                    Item::Actual(_, _, blob) => Some(*blob),
+                    Item::Sentinel => None,
+                }
+            }
+
+            fn is_last(&self) -> bool {
+                matches!(self, Item::Sentinel)
+            }
+        }
+
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );
+
+        let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx);
+        let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos));
+        // put in a sentinel value for getting the end offset for last item, and not having to
+        // repeat the whole read part
+        let stream = stream.chain(futures::stream::once(futures::future::ready(Ok(
+            Item::Sentinel,
+        ))));
+        let mut stream = std::pin::pin!(stream);
+
+        let mut prev: Option<(Key, Lsn, BlobRef)> = None;
+
+        let mut read_builder: Option<VectoredReadBuilder> = None;
+
+        let max_read_size = self
+            .max_vectored_read_bytes
+            .map(|x| x.0.get())
+            .unwrap_or(8192);
+
+        let mut buffer = Some(BytesMut::with_capacity(max_read_size));
+
+        // FIXME: buffering of DeltaLayerWriter
+        let mut per_blob_copy = Vec::new();
+
+        while let Some(item) = stream.try_next().await? {
+            tracing::debug!(?item, "popped");
+            let offset = item
+                .offset()
+                .unwrap_or(BlobRef::new(self.index_start_offset(), false));
+
+            let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
+                let end_offset = offset;
+
+                Some((BlobMeta { key, lsn }, start_offset..end_offset))
+            } else {
+                None
+            };
+
+            let is_last = item.is_last();
+
+            prev = Option::from(item);
+
+            let actionable = actionable.filter(|x| x.0.lsn < truncate_at);
+
+            let builder = if let Some((meta, offsets)) = actionable {
+                // extend or create a new builder
+                if read_builder
+                    .as_mut()
+                    .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta))
+                    .unwrap_or(VectoredReadExtended::No)
+                    == VectoredReadExtended::Yes
+                {
+                    None
+                } else {
+                    read_builder.replace(VectoredReadBuilder::new(
+                        offsets.start.pos(),
+                        offsets.end.pos(),
+                        meta,
+                        max_read_size,
+                    ))
+                }
+            } else {
+                // nothing to do, except perhaps flush any existing for the last element
+                None
+            };
+
+            // flush the possible older builder and also the new one if the item was the last one
+            let builders = builder.into_iter();
+            let builders = if is_last {
+                builders.chain(read_builder.take())
+            } else {
+                builders.chain(None)
+            };
+
+            for builder in builders {
+                let read = builder.build();
+
+                let reader = VectoredBlobReader::new(&self.file);
+
+                let mut buf = buffer.take().unwrap();
+
+                buf.clear();
+                buf.reserve(read.size());
+                let res = reader.read_blobs(&read, buf).await?;
+
+                for blob in res.blobs {
+                    let key = blob.meta.key;
+                    let lsn = blob.meta.lsn;
+                    let data = &res.buf[blob.start..blob.end];
+
+                    #[cfg(debug_assertions)]
+                    Value::des(data)
+                        .with_context(|| {
+                            format!(
+                                "blob failed to deserialize for {}@{}, {}..{}: {:?}",
+                                blob.meta.key,
+                                blob.meta.lsn,
+                                blob.start,
+                                blob.end,
+                                utils::Hex(data)
+                            )
+                        })
+                        .unwrap();
+
+                    // is it an image or will_init walrecord?
+                    // FIXME: this could be handled by threading the BlobRef to the
+                    // VectoredReadBuilder
+                    let will_init = crate::repository::ValueBytes::will_init(data)
+                        .inspect_err(|_e| {
+                            #[cfg(feature = "testing")]
+                            tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value");
+                        })
+                        .unwrap_or(false);
+
+                    per_blob_copy.clear();
+                    per_blob_copy.extend_from_slice(data);
+
+                    let (tmp, res) = writer
+                        .put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init)
+                        .await;
+                    per_blob_copy = tmp;
+                    res?;
+                }
+
+                buffer = Some(res.buf);
+            }
+        }
+
+        assert!(
+            read_builder.is_none(),
+            "with the sentinel above loop should had handled all"
+        );
+
+        Ok(())
+    }
+
     pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
         println!(
             "index_start_blk: {}, root {}",
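
`copy_prefix` chains one `Sentinel` onto the index stream so the loop runs one extra iteration: each entry's end offset is simply the next item's start offset, and the sentinel supplies the final end offset (the start of the index) without a special case. A minimal synchronous sketch of that idiom over plain integer offsets, not the pageserver's stream types:

```rust
// Sentinel idiom from `copy_prefix`, reduced to plain offsets: chain one extra
// item so every blob's end offset is just the next item's start offset.
fn blob_ranges(starts: &[u64], index_start: u64) -> Vec<std::ops::Range<u64>> {
    let items = starts
        .iter()
        .copied()
        .map(Some)
        .chain(std::iter::once(None)); // None plays the role of Item::Sentinel

    let mut prev: Option<u64> = None;
    let mut ranges = Vec::new();
    for item in items {
        // the sentinel's "offset" is where the index begins, i.e. end of values
        let offset = item.unwrap_or(index_start);
        if let Some(start) = prev.take() {
            ranges.push(start..offset);
        }
        prev = item;
    }
    ranges
}

fn main() {
    assert_eq!(blob_ranges(&[0, 10, 40], 100), vec![0..10, 10..40, 40..100]);
}
```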
@@ -1147,6 +1364,44 @@ impl DeltaLayerInner {
 
         Ok(())
     }
+
+    #[cfg(test)]
+    fn stream_index_forwards<'a, R>(
+        &'a self,
+        reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,
+        start: &'a [u8; DELTA_KEY_SIZE],
+        ctx: &'a RequestContext,
+    ) -> impl futures::stream::Stream<
+        Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>,
+    > + 'a
+    where
+        R: BlockReader,
+    {
+        use futures::stream::TryStreamExt;
+        let stream = reader.get_stream_from(start, ctx);
+        stream.map_ok(|(key, value)| {
+            let key = DeltaKey::from_slice(&key);
+            let (key, lsn) = (key.key(), key.lsn());
+            let offset = BlobRef(value);
+
+            (key, lsn, offset)
+        })
+    }
+
+    /// The file offset to the first block of index.
+    ///
+    /// The file structure is summary, values, and index. We often need this for the size of last blob.
+    fn index_start_offset(&self) -> u64 {
+        let offset = self.index_start_blk as u64 * PAGE_SZ as u64;
+        let bref = BlobRef(offset);
+        tracing::debug!(
+            index_start_blk = self.index_start_blk,
+            offset,
+            pos = bref.pos(),
+            "index_start_offset"
+        );
+        offset
+    }
 }
 
 /// A set of data associated with a delta layer key and its value
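
The new `index_start_offset` helper centralizes the block-to-byte conversion that previously appeared inline at several call sites. A tiny worked example of the arithmetic, using a local `PAGE_SZ` constant (8192 in the pageserver):

```rust
// The conversion `index_start_offset` centralizes: block number to byte offset.
const PAGE_SZ: u64 = 8192;

fn index_start_offset(index_start_blk: u32) -> u64 {
    index_start_blk as u64 * PAGE_SZ
}

fn main() {
    // e.g. an index starting at block 3 begins at byte 24576; everything
    // before that offset is the summary page plus the "values" section.
    assert_eq!(index_start_offset(3), 24576);
}
```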
@@ -1210,9 +1465,16 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
 mod test {
     use std::collections::BTreeMap;
 
+    use itertools::MinMaxResult;
+    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
+    use rand::RngCore;
+
     use super::*;
     use crate::{
-        context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
+        context::DownloadBehavior,
+        task_mgr::TaskKind,
+        tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
+        DEFAULT_PG_VERSION,
     };
 
     /// Construct an index for a fictional delta layer and and then
@@ -1332,4 +1594,442 @@ mod test {
 
         assert_eq!(planned_blobs, expected_blobs);
     }
+
+    mod constants {
+        use utils::lsn::Lsn;
+
+        /// Offset used by all lsns in this test
+        pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
+        /// Number of unique keys including in the test data
+        pub(super) const KEY_COUNT: u8 = 60;
+        /// Max number of different lsns for each key
+        pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
+        /// Possible value sizes for each key along with a probability weight
+        pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
+        /// Probability that there will be a gap between the current key and the next one (33.3%)
+        pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
+        /// The minimum size of a key range in all the generated reads
+        pub(super) const MIN_RANGE_SIZE: i128 = 10;
+        /// The number of ranges included in each vectored read
+        pub(super) const RANGES_COUNT: u8 = 2;
+        /// The number of vectored reads performed
+        pub(super) const READS_COUNT: u8 = 100;
+        /// Soft max size of a vectored read. Will be violated if we have to read keys
+        /// with values larger than the limit
+        pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
+    }
+
+    struct Entry {
+        key: Key,
+        lsn: Lsn,
+        value: Vec<u8>,
+    }
+
+    fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
+        let mut current_key = Key::MIN;
+
+        let mut entries = Vec::new();
+        for _ in 0..constants::KEY_COUNT {
+            let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
+            let mut lsns_iter =
+                std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
+                    Some(Lsn(lsn.0 + 0x08))
+                });
+            let mut lsns = Vec::new();
+            while lsns.len() < count as usize {
+                let take = rng.gen_bool(0.5);
+                let lsn = lsns_iter.next().unwrap();
+                if take {
+                    lsns.push(lsn);
+                }
+            }
+
+            for lsn in lsns {
+                let size = constants::VALUE_SIZES
+                    .choose_weighted(rng, |item| item.1)
+                    .unwrap()
+                    .0;
+                let mut buf = vec![0; size];
+                rng.fill_bytes(&mut buf);
+
+                entries.push(Entry {
+                    key: current_key,
+                    lsn,
+                    value: buf,
+                })
+            }
+
+            let gap = constants::KEY_GAP_CHANGES
+                .choose_weighted(rng, |item| item.1)
+                .unwrap()
+                .0;
+            if gap {
+                current_key = current_key.add(2);
+            } else {
+                current_key = current_key.add(1);
+            }
+        }
+
+        entries
+    }
+
+    struct EntriesMeta {
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+        index: BTreeMap<(Key, Lsn), Vec<u8>>,
+    }
+
+    fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
+        let key_range = match entries.iter().minmax_by_key(|e| e.key) {
+            MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
+            _ => panic!("More than one entry is always expected"),
+        };
+
+        let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
+            MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
+            _ => panic!("More than one entry is always expected"),
+        };
+
+        let mut index = BTreeMap::new();
+        for entry in entries.iter() {
+            index.insert((entry.key, entry.lsn), entry.value.clone());
+        }
+
+        EntriesMeta {
+            key_range,
+            lsn_range,
+            index,
+        }
+    }
+
+    fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
+        let start = key_range.start.to_i128();
+        let end = key_range.end.to_i128();
+
+        let mut keyspace = KeySpace::default();
+
+        for _ in 0..constants::RANGES_COUNT {
+            let mut range: Option<Range<Key>> = Option::default();
+            while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
+                let range_start = rng.gen_range(start..end);
+                let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
+                if range_end_offset >= end {
+                    range = Some(Key::from_i128(range_start)..Key::from_i128(end));
+                } else {
+                    let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
+                    range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
+                }
+            }
+            keyspace.ranges.push(range.unwrap());
+        }
+
+        keyspace
+    }
+
+    #[tokio::test]
+    async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
+        let (tenant, ctx) = harness.load().await;
+
+        let timeline_id = TimelineId::generate();
+        let timeline = tenant
+            .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        tracing::info!("Generating test data ...");
+
+        let rng = &mut StdRng::seed_from_u64(0);
+        let entries = generate_entries(rng);
+        let entries_meta = get_entries_meta(&entries);
+
+        tracing::info!("Done generating {} entries", entries.len());
+
+        tracing::info!("Writing test data to delta layer ...");
+        let mut writer = DeltaLayerWriter::new(
+            harness.conf,
+            timeline_id,
+            harness.tenant_shard_id,
+            entries_meta.key_range.start,
+            entries_meta.lsn_range.clone(),
+        )
+        .await?;
+
+        for entry in entries {
+            let (_, res) = writer
+                .put_value_bytes(entry.key, entry.lsn, entry.value, false)
+                .await;
+            res?;
+        }
+
+        let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
+
+        let inner = resident.as_delta(&ctx).await?;
+
+        let file_size = inner.file.metadata().await?.len();
+        tracing::info!(
+            "Done writing test data to delta layer. Resulting file size is: {}",
+            file_size
+        );
+
+        for i in 0..constants::READS_COUNT {
+            tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
+
+            let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
+            let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+                inner.index_start_blk,
+                inner.index_root_blk,
+                block_reader,
+            );
+
+            let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
+            let mut reconstruct_state = ValuesReconstructState::new();
+            let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
+            let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
+
+            let vectored_reads = DeltaLayerInner::plan_reads(
+                keyspace.clone(),
+                entries_meta.lsn_range.clone(),
+                data_end_offset,
+                index_reader,
+                planner,
+                &mut reconstruct_state,
+                &ctx,
+            )
+            .await?;
+
+            let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
+            let buf_size = DeltaLayerInner::get_min_read_buffer_size(
+                &vectored_reads,
+                constants::MAX_VECTORED_READ_BYTES,
+            );
+            let mut buf = Some(BytesMut::with_capacity(buf_size));
+
+            for read in vectored_reads {
+                let blobs_buf = vectored_blob_reader
+                    .read_blobs(&read, buf.take().expect("Should have a buffer"))
+                    .await?;
+                for meta in blobs_buf.blobs.iter() {
+                    let value = &blobs_buf.buf[meta.start..meta.end];
+                    assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
+                }
+
+                buf = Some(blobs_buf.buf);
+            }
+        }
+
+        Ok(())
+    }
+
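
The test data generator above draws value sizes with `rand`'s weighted sampling, so with weights 2:2:1 roughly one value in five is a megabyte, which is what forces the oversized-read path under test. A standalone sketch of that sampling with the same weights (assuming only the `rand` crate):

```rust
use rand::prelude::{SeedableRng, SliceRandom, StdRng};

// Standalone sketch of the weighted size sampling used by `generate_entries`.
fn main() {
    let sizes: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
    let mut rng = StdRng::seed_from_u64(0);

    let mut big = 0usize;
    for _ in 0..1000 {
        let (size, _weight) = sizes.choose_weighted(&mut rng, |item| item.1).unwrap();
        if *size == 1024 * 1024 {
            big += 1;
        }
    }
    // weights 2:2:1 make the 1 MiB size come up ~20% of the time
    println!("1 MiB values drawn: {big}/1000");
}
```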
+    #[tokio::test]
+    async fn copy_delta_prefix_smoke() {
+        use crate::walrecord::NeonWalRecord;
+        use bytes::Bytes;
+
+        let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
+        let (tenant, ctx) = h.load().await;
+        let ctx = &ctx;
+        let timeline = tenant
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
+            .await
+            .unwrap();
+
+        let initdb_layer = timeline
+            .layers
+            .read()
+            .await
+            .likely_resident_layers()
+            .next()
+            .unwrap();
+
+        {
+            let mut writer = timeline.writer().await;
+
+            let data = [
+                (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))),
+                (
+                    0x30,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: false,
+                        rec: Bytes::from_static(b"1"),
+                    }),
+                ),
+                (
+                    0x40,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: true,
+                        rec: Bytes::from_static(b"2"),
+                    }),
+                ),
+                // build an oversized value so we cannot extend and existing read over
+                // this
+                (
+                    0x50,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: true,
+                        rec: {
+                            let mut buf =
+                                vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024];
+                            buf.iter_mut()
+                                .enumerate()
+                                .for_each(|(i, slot)| *slot = (i % 256) as u8);
+                            Bytes::from(buf)
+                        },
+                    }),
+                ),
+                // because the oversized read cannot be extended further, we are sure to exercise the
+                // builder created on the last round with this:
+                (
+                    0x60,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: true,
+                        rec: Bytes::from_static(b"3"),
+                    }),
+                ),
+                (
+                    0x60,
+                    9,
+                    Value::Image(Bytes::from_static(b"something for a different key")),
+                ),
+            ];
+
+            let mut last_lsn = None;
+
+            for (lsn, key, value) in data {
+                let key = Key::from_i128(key);
+                writer.put(key, Lsn(lsn), &value, ctx).await.unwrap();
+                last_lsn = Some(lsn);
+            }
+
+            writer.finish_write(Lsn(last_lsn.unwrap()));
+        }
+        timeline.freeze_and_flush().await.unwrap();
+
+        let new_layer = timeline
+            .layers
+            .read()
+            .await
+            .likely_resident_layers()
+            .find(|x| x != &initdb_layer)
+            .unwrap();
+
+        // create a copy for the timeline, so we don't overwrite the file
+        let branch = tenant
+            .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60));
+
+        // truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just
+        // a single key
+
+        for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] {
+            let truncate_at = Lsn(truncate_at);
+
+            let mut writer = DeltaLayerWriter::new(
+                tenant.conf,
+                branch.timeline_id,
+                tenant.tenant_shard_id,
+                Key::MIN,
+                Lsn(0x11)..truncate_at,
+            )
+            .await
+            .unwrap();
+
+            let new_layer = new_layer.download_and_keep_resident().await.unwrap();
+
+            new_layer
+                .copy_delta_prefix(&mut writer, truncate_at, ctx)
+                .await
+                .unwrap();
+
+            let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap();
+
+            copied_layer.as_delta(ctx).await.unwrap();
+
+            assert_keys_and_values_eq(
+                new_layer.as_delta(ctx).await.unwrap(),
+                copied_layer.as_delta(ctx).await.unwrap(),
+                truncate_at,
+                ctx,
+            )
+            .await;
+        }
+    }
+
+    async fn assert_keys_and_values_eq(
+        source: &DeltaLayerInner,
+        truncated: &DeltaLayerInner,
+        truncated_at: Lsn,
+        ctx: &RequestContext,
+    ) {
+        use futures::future::ready;
+        use futures::stream::TryStreamExt;
+
+        let start_key = [0u8; DELTA_KEY_SIZE];
+
+        let source_reader = FileBlockReader::new(&source.file, source.file_id);
+        let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            source.index_start_blk,
+            source.index_root_blk,
+            &source_reader,
+        );
+        let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx);
+        let source_stream = source_stream.filter(|res| match res {
+            Ok((_, lsn, _)) => ready(lsn < &truncated_at),
+            _ => ready(true),
+        });
+        let mut source_stream = std::pin::pin!(source_stream);
+
+        let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id);
+        let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            truncated.index_start_blk,
+            truncated.index_root_blk,
+            &truncated_reader,
+        );
+        let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx);
+        let mut truncated_stream = std::pin::pin!(truncated_stream);
+
+        let mut scratch_left = Vec::new();
+        let mut scratch_right = Vec::new();
+
+        loop {
+            let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next());
+            let (src, truncated) = tokio::try_join!(src, truncated).unwrap();
+
+            if src.is_none() {
+                assert!(truncated.is_none());
+                break;
+            }
+
+            let (src, truncated) = (src.unwrap(), truncated.unwrap());
+
+            // because we've filtered the source with Lsn, we should always have the same keys from both.
+            assert_eq!(src.0, truncated.0);
+            assert_eq!(src.1, truncated.1);
+
+            // if this is needed for something else, just drop this assert.
+            assert!(
+                src.2.pos() >= truncated.2.pos(),
+                "value position should not go backwards {} vs. {}",
+                src.2.pos(),
+                truncated.2.pos()
+            );
+
+            scratch_left.clear();
+            let src_cursor = source_reader.block_cursor();
+            let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx);
+            scratch_right.clear();
+            let trunc_cursor = truncated_reader.block_cursor();
+            let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx);
+
+            tokio::try_join!(left, right).unwrap();
+
+            assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right));
+        }
+    }
 }
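
`assert_keys_and_values_eq` walks the two B-tree indices in lockstep after filtering the source stream by LSN, so both streams must yield the same (key, lsn) pairs and run out together. A simplified synchronous sketch of that comparison over sorted tuples, not the async streams used above:

```rust
// Simplified version of the lockstep comparison in `assert_keys_and_values_eq`:
// filter the source by LSN, then require the two sorted sequences to match
// exactly, entry for entry.
fn assert_prefix_eq(source: &[(u64, u64)], truncated: &[(u64, u64)], truncate_at: u64) {
    let mut src = source.iter().filter(|(_, lsn)| *lsn < truncate_at);
    let mut trunc = truncated.iter();
    loop {
        match (src.next(), trunc.next()) {
            (None, None) => break, // both exhausted together
            (Some(a), Some(b)) => assert_eq!(a, b), // same (key, lsn) in lockstep
            (a, b) => panic!("length mismatch: {a:?} vs {b:?}"),
        }
    }
}

fn main() {
    let source = [(1, 0x20), (1, 0x30), (2, 0x40)];
    let truncated = [(1, 0x20), (1, 0x30)];
    assert_prefix_eq(&source, &truncated, 0x40);
}
```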
Some files were not shown because too many files have changed in this diff.