mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-29 18:20:38 +00:00
Compare commits
290 Commits
jemalloc-p
...
proxy-remo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1450d8dc43 | ||
|
|
b2d34a82b9 | ||
|
|
3797566c36 | ||
|
|
43f9a16e46 | ||
|
|
71a7fd983e | ||
|
|
a3f5b83677 | ||
|
|
1455f5a261 | ||
|
|
3860bc9c6c | ||
|
|
c1f4028fc0 | ||
|
|
0e4f182680 | ||
|
|
ea2e830707 | ||
|
|
7cf726e36e | ||
|
|
6b3164269c | ||
|
|
75a52ac7fd | ||
|
|
e28e46f20b | ||
|
|
d5d15eb6eb | ||
|
|
49d7f9b5a4 | ||
|
|
95a49f0075 | ||
|
|
545f7e8cd7 | ||
|
|
cd6d811213 | ||
|
|
8f3c316bae | ||
|
|
58e31fe098 | ||
|
|
a43a1ad1df | ||
|
|
eb0c026aac | ||
|
|
ff560a1113 | ||
|
|
4a278cce7c | ||
|
|
f98fdd20e3 | ||
|
|
014f822a78 | ||
|
|
ddd8ebd253 | ||
|
|
9cfe08e3d9 | ||
|
|
64577cfddc | ||
|
|
37f81289c2 | ||
|
|
9217564026 | ||
|
|
3404e76a51 | ||
|
|
62aac6c8ad | ||
|
|
e015b2bf3e | ||
|
|
a7f31f1a59 | ||
|
|
325f3784f9 | ||
|
|
900f391115 | ||
|
|
8901ce9c99 | ||
|
|
ce44dfe353 | ||
|
|
d1d55bbd9f | ||
|
|
df9ab1b5e3 | ||
|
|
ef96c82c9f | ||
|
|
b43f6daa48 | ||
|
|
664f92dc6e | ||
|
|
bd5cb9e86b | ||
|
|
00d66e8012 | ||
|
|
679e031cf6 | ||
|
|
e3f6a07ca3 | ||
|
|
a8a88ba7bc | ||
|
|
353afe4fe7 | ||
|
|
1988ad8db7 | ||
|
|
e3415706b7 | ||
|
|
9d081851ec | ||
|
|
781352bd8e | ||
|
|
8030b8e4c5 | ||
|
|
9a4b896636 | ||
|
|
e8b8ebfa1d | ||
|
|
d9d471e3c4 | ||
|
|
d43dcceef9 | ||
|
|
f2771a99b7 | ||
|
|
f54c3b96e0 | ||
|
|
478cc37a70 | ||
|
|
4ce6e2d2fc | ||
|
|
baeb58432f | ||
|
|
6f3e043a76 | ||
|
|
6810d2aa53 | ||
|
|
2d7091871f | ||
|
|
7701ca45dd | ||
|
|
de8dfee4bd | ||
|
|
e3f51abadf | ||
|
|
a7b84cca5a | ||
|
|
291fcb9e4f | ||
|
|
a5ecca976e | ||
|
|
5caee4ca54 | ||
|
|
e1a9669d05 | ||
|
|
aaf60819fa | ||
|
|
c84656a53e | ||
|
|
af99c959ef | ||
|
|
a8e6d259cb | ||
|
|
c1390bfc3b | ||
|
|
6d951e69d6 | ||
|
|
4b8809b280 | ||
|
|
4c5afb7b10 | ||
|
|
ec069dc45e | ||
|
|
790c05d675 | ||
|
|
923cf91aa4 | ||
|
|
03c6039707 | ||
|
|
c6d5ff944d | ||
|
|
4b97683338 | ||
|
|
affc18f912 | ||
|
|
3ef6e21211 | ||
|
|
1075386d77 | ||
|
|
c3dd646ab3 | ||
|
|
bc78b0e9cc | ||
|
|
f342b87f30 | ||
|
|
438bacc32e | ||
|
|
1a2a3cb446 | ||
|
|
4eedb3b6f1 | ||
|
|
e67fcf9563 | ||
|
|
82960b2175 | ||
|
|
30d15ad403 | ||
|
|
b6ee91835b | ||
|
|
df0f1e359b | ||
|
|
cd0e344938 | ||
|
|
22afaea6e1 | ||
|
|
ba20752b76 | ||
|
|
3a6fa76828 | ||
|
|
9ffb852359 | ||
|
|
972470b174 | ||
|
|
1412e9b3e8 | ||
|
|
be0c73f8e7 | ||
|
|
7f51764001 | ||
|
|
4d8a10af1c | ||
|
|
55ba885f6b | ||
|
|
6ff74295b5 | ||
|
|
bbe730d7ca | ||
|
|
5a0da93c53 | ||
|
|
d9dcbffac3 | ||
|
|
f50ff14560 | ||
|
|
b58a615197 | ||
|
|
1a1d527875 | ||
|
|
216fc5ba7b | ||
|
|
4270e86eb2 | ||
|
|
6351313ae9 | ||
|
|
95098c3216 | ||
|
|
d7c68dc981 | ||
|
|
6206f76419 | ||
|
|
d7f34bc339 | ||
|
|
86905c1322 | ||
|
|
0b02043ba4 | ||
|
|
873b222080 | ||
|
|
13d9589c35 | ||
|
|
be1a88e574 | ||
|
|
b9fd8dcf13 | ||
|
|
5ea117cddf | ||
|
|
2682e0254f | ||
|
|
41fb838799 | ||
|
|
107f535294 | ||
|
|
39c712f2ca | ||
|
|
ab10523cc1 | ||
|
|
d5399b729b | ||
|
|
b06eec41fa | ||
|
|
ca154d9cd8 | ||
|
|
1173ee6a7e | ||
|
|
21e1a496a3 | ||
|
|
0457980728 | ||
|
|
8728d5a5fd | ||
|
|
a4a4d78993 | ||
|
|
870786bd82 | ||
|
|
b6d547cf92 | ||
|
|
e3a2631df9 | ||
|
|
02d42861e4 | ||
|
|
586e77bb24 | ||
|
|
b827e7b330 | ||
|
|
26b1483204 | ||
|
|
d709bcba81 | ||
|
|
b158a5eda0 | ||
|
|
0c99e5ec6d | ||
|
|
0af66a6003 | ||
|
|
017c34b773 | ||
|
|
308227fa51 | ||
|
|
d041f9a887 | ||
|
|
ea531d448e | ||
|
|
2dbd1c1ed5 | ||
|
|
51376ef3c8 | ||
|
|
5a3d8e75ed | ||
|
|
6e4e578841 | ||
|
|
3c9b484c4d | ||
|
|
af849a1f61 | ||
|
|
ac7dc82103 | ||
|
|
f1b654b77d | ||
|
|
7dd58e1449 | ||
|
|
f3af5f4660 | ||
|
|
a96e15cb6b | ||
|
|
df1def7018 | ||
|
|
69337be5c2 | ||
|
|
67a2215163 | ||
|
|
3764dd2e84 | ||
|
|
0115fe6cb2 | ||
|
|
e6da7e29ed | ||
|
|
0353a72a00 | ||
|
|
ce4d3da3ae | ||
|
|
5da3e2113a | ||
|
|
4deb8dc52e | ||
|
|
64f0613edf | ||
|
|
1e7cd6ac9f | ||
|
|
ef03b38e52 | ||
|
|
9b65946566 | ||
|
|
a3fe12b6d8 | ||
|
|
b5a6e68e68 | ||
|
|
ce0ddd749c | ||
|
|
426598cf76 | ||
|
|
8b4dd5dc27 | ||
|
|
ed9a114bde | ||
|
|
b7385bb016 | ||
|
|
37b1930b2f | ||
|
|
d76963691f | ||
|
|
60f570c70d | ||
|
|
3582a95c87 | ||
|
|
00423152c6 | ||
|
|
240efb82f9 | ||
|
|
5f099dc760 | ||
|
|
7a49e5d5c2 | ||
|
|
45ec8688ea | ||
|
|
4b55dad813 | ||
|
|
ab95942fc2 | ||
|
|
f656db09a4 | ||
|
|
69bf1bae7d | ||
|
|
25af32e834 | ||
|
|
cb4b4750ba | ||
|
|
d43d77389e | ||
|
|
5558457c84 | ||
|
|
26e6ff8ba6 | ||
|
|
50a45e67dc | ||
|
|
fcbe60f436 | ||
|
|
e018cac1f7 | ||
|
|
a74b60066c | ||
|
|
3a2f10712a | ||
|
|
4ac4b21598 | ||
|
|
9f792f9c0b | ||
|
|
7434674d86 | ||
|
|
ea37234ccc | ||
|
|
3da54e6d90 | ||
|
|
010f0a310a | ||
|
|
eb53345d48 | ||
|
|
45c625fb34 | ||
|
|
84b6b95783 | ||
|
|
577982b778 | ||
|
|
574645412b | ||
|
|
11945e64ec | ||
|
|
cddafc79e1 | ||
|
|
af7cca4949 | ||
|
|
89cae64e38 | ||
|
|
1f417af9fd | ||
|
|
1684bbf162 | ||
|
|
90cadfa986 | ||
|
|
2226acef7c | ||
|
|
24ce878039 | ||
|
|
84914434e3 | ||
|
|
b655c7030f | ||
|
|
3695a1efa1 | ||
|
|
75b4440d07 | ||
|
|
ee3437cbd8 | ||
|
|
dbe0aa653a | ||
|
|
39427925c2 | ||
|
|
af43f78561 | ||
|
|
ed57772793 | ||
|
|
f1de18f1c9 | ||
|
|
dbb0c967d5 | ||
|
|
bf369f4268 | ||
|
|
70f4a16a05 | ||
|
|
d63185fa6c | ||
|
|
ca8fca0e9f | ||
|
|
0397427dcf | ||
|
|
a2a44ea213 | ||
|
|
4917f52c88 | ||
|
|
04a682021f | ||
|
|
c59abedd85 | ||
|
|
5357f40183 | ||
|
|
e4a279db13 | ||
|
|
b1d47f3911 | ||
|
|
a3d62b31bb | ||
|
|
cdccab4bd9 | ||
|
|
e8814b6f81 | ||
|
|
c18d3340b5 | ||
|
|
447a063f3c | ||
|
|
c12861cccd | ||
|
|
2a3a8ee31d | ||
|
|
5dda371c2b | ||
|
|
a60035b23a | ||
|
|
18fd73d84a | ||
|
|
ee9ec26808 | ||
|
|
e22c072064 | ||
|
|
89f023e6b0 | ||
|
|
8426fb886b | ||
|
|
28e7fa98c4 | ||
|
|
a9fda8c832 | ||
|
|
fa12d60237 | ||
|
|
d551bfee09 | ||
|
|
e69ff3fc00 | ||
|
|
25d9dc6eaf | ||
|
|
139d1346d5 | ||
|
|
0bd16182f7 | ||
|
|
6a5650d40c | ||
|
|
47addc15f1 | ||
|
|
b91c58a8bf | ||
|
|
00d9c2d9a8 | ||
|
|
3a673dce67 |
@@ -1,2 +1,2 @@
|
|||||||
[profile.default]
|
[profile.default]
|
||||||
slow-timeout = { period = "20s", terminate-after = 3 }
|
slow-timeout = { period = "60s", terminate-after = 3 }
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
!libs/
|
!libs/
|
||||||
!neon_local/
|
!neon_local/
|
||||||
!pageserver/
|
!pageserver/
|
||||||
|
!patches/
|
||||||
!pgxn/
|
!pgxn/
|
||||||
!proxy/
|
!proxy/
|
||||||
!s3_scrubber/
|
!s3_scrubber/
|
||||||
|
|||||||
5
.github/actionlint.yml
vendored
5
.github/actionlint.yml
vendored
@@ -1,12 +1,11 @@
|
|||||||
self-hosted-runner:
|
self-hosted-runner:
|
||||||
labels:
|
labels:
|
||||||
- arm64
|
- arm64
|
||||||
- dev
|
|
||||||
- gen3
|
- gen3
|
||||||
- large
|
- large
|
||||||
# Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
|
- large-arm64
|
||||||
- macos-14
|
|
||||||
- small
|
- small
|
||||||
|
- small-arm64
|
||||||
- us-east-2
|
- us-east-2
|
||||||
config-variables:
|
config-variables:
|
||||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||||
|
|||||||
@@ -3,13 +3,13 @@ description: 'Create Branch using API'
|
|||||||
|
|
||||||
inputs:
|
inputs:
|
||||||
api_key:
|
api_key:
|
||||||
desctiption: 'Neon API key'
|
description: 'Neon API key'
|
||||||
required: true
|
required: true
|
||||||
project_id:
|
project_id:
|
||||||
desctiption: 'ID of the Project to create Branch in'
|
description: 'ID of the Project to create Branch in'
|
||||||
required: true
|
required: true
|
||||||
api_host:
|
api_host:
|
||||||
desctiption: 'Neon API host'
|
description: 'Neon API host'
|
||||||
default: console-stage.neon.build
|
default: console-stage.neon.build
|
||||||
outputs:
|
outputs:
|
||||||
dsn:
|
dsn:
|
||||||
|
|||||||
@@ -3,16 +3,16 @@ description: 'Delete Branch using API'
|
|||||||
|
|
||||||
inputs:
|
inputs:
|
||||||
api_key:
|
api_key:
|
||||||
desctiption: 'Neon API key'
|
description: 'Neon API key'
|
||||||
required: true
|
required: true
|
||||||
project_id:
|
project_id:
|
||||||
desctiption: 'ID of the Project which should be deleted'
|
description: 'ID of the Project which should be deleted'
|
||||||
required: true
|
required: true
|
||||||
branch_id:
|
branch_id:
|
||||||
desctiption: 'ID of the branch to delete'
|
description: 'ID of the branch to delete'
|
||||||
required: true
|
required: true
|
||||||
api_host:
|
api_host:
|
||||||
desctiption: 'Neon API host'
|
description: 'Neon API host'
|
||||||
default: console-stage.neon.build
|
default: console-stage.neon.build
|
||||||
|
|
||||||
runs:
|
runs:
|
||||||
|
|||||||
14
.github/actions/neon-project-create/action.yml
vendored
14
.github/actions/neon-project-create/action.yml
vendored
@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'
|
|||||||
|
|
||||||
inputs:
|
inputs:
|
||||||
api_key:
|
api_key:
|
||||||
desctiption: 'Neon API key'
|
description: 'Neon API key'
|
||||||
required: true
|
required: true
|
||||||
region_id:
|
region_id:
|
||||||
desctiption: 'Region ID, if not set the project will be created in the default region'
|
description: 'Region ID, if not set the project will be created in the default region'
|
||||||
default: aws-us-east-2
|
default: aws-us-east-2
|
||||||
postgres_version:
|
postgres_version:
|
||||||
desctiption: 'Postgres version; default is 15'
|
description: 'Postgres version; default is 15'
|
||||||
default: 15
|
default: '15'
|
||||||
api_host:
|
api_host:
|
||||||
desctiption: 'Neon API host'
|
description: 'Neon API host'
|
||||||
default: console-stage.neon.build
|
default: console-stage.neon.build
|
||||||
provisioner:
|
provisioner:
|
||||||
desctiption: 'k8s-pod or k8s-neonvm'
|
description: 'k8s-pod or k8s-neonvm'
|
||||||
default: 'k8s-pod'
|
default: 'k8s-pod'
|
||||||
compute_units:
|
compute_units:
|
||||||
desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
||||||
default: '[1, 1]'
|
default: '[1, 1]'
|
||||||
|
|
||||||
outputs:
|
outputs:
|
||||||
|
|||||||
@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'
|
|||||||
|
|
||||||
inputs:
|
inputs:
|
||||||
api_key:
|
api_key:
|
||||||
desctiption: 'Neon API key'
|
description: 'Neon API key'
|
||||||
required: true
|
required: true
|
||||||
project_id:
|
project_id:
|
||||||
desctiption: 'ID of the Project to delete'
|
description: 'ID of the Project to delete'
|
||||||
required: true
|
required: true
|
||||||
api_host:
|
api_host:
|
||||||
desctiption: 'Neon API host'
|
description: 'Neon API host'
|
||||||
default: console-stage.neon.build
|
default: console-stage.neon.build
|
||||||
|
|
||||||
runs:
|
runs:
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
arch: [ x64, arm64 ]
|
arch: [ x64, arm64 ]
|
||||||
|
|
||||||
runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
|
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||||
|
|
||||||
env:
|
env:
|
||||||
IMAGE_TAG: ${{ inputs.image-tag }}
|
IMAGE_TAG: ${{ inputs.image-tag }}
|
||||||
|
|||||||
289
.github/workflows/build_and_test.yml
vendored
289
.github/workflows/build_and_test.yml
vendored
@@ -236,27 +236,6 @@ jobs:
|
|||||||
submodules: true
|
submodules: true
|
||||||
fetch-depth: 1
|
fetch-depth: 1
|
||||||
|
|
||||||
- name: Check Postgres submodules revision
|
|
||||||
shell: bash -euo pipefail {0}
|
|
||||||
run: |
|
|
||||||
# This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally).
|
|
||||||
# Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
|
|
||||||
|
|
||||||
FAILED=false
|
|
||||||
for postgres in postgres-v14 postgres-v15 postgres-v16; do
|
|
||||||
expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
|
|
||||||
actual=$(git rev-parse "HEAD:vendor/${postgres}")
|
|
||||||
if [ "${expected}" != "${actual}" ]; then
|
|
||||||
echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
|
|
||||||
FAILED=true
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
if [ "${FAILED}" = "true" ]; then
|
|
||||||
echo >&2 "Please update vendor/revisions.json if these changes are intentional"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Set pg 14 revision for caching
|
- name: Set pg 14 revision for caching
|
||||||
id: pg_v14_rev
|
id: pg_v14_rev
|
||||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||||
@@ -362,6 +341,9 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
NEXTEST_RETRIES: 3
|
NEXTEST_RETRIES: 3
|
||||||
run: |
|
run: |
|
||||||
|
#nextest does not yet support running doctests
|
||||||
|
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||||
|
|
||||||
for io_engine in std-fs tokio-epoll-uring ; do
|
for io_engine in std-fs tokio-epoll-uring ; do
|
||||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||||
done
|
done
|
||||||
@@ -477,6 +459,8 @@ jobs:
|
|||||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||||
|
PAGESERVER_GET_IMPL: vectored
|
||||||
|
PAGESERVER_VALIDATE_VEC_GET: true
|
||||||
|
|
||||||
# Temporary disable this step until we figure out why it's so flaky
|
# Temporary disable this step until we figure out why it's so flaky
|
||||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||||
@@ -556,12 +540,33 @@ jobs:
|
|||||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||||
|
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||||
|
PAGESERVER_GET_IMPL: vectored
|
||||||
|
PAGESERVER_VALIDATE_VEC_GET: false
|
||||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||||
# while coverage is currently collected for the debug ones
|
# while coverage is currently collected for the debug ones
|
||||||
|
|
||||||
|
report-benchmarks-failures:
|
||||||
|
needs: [ benchmarks, create-test-report ]
|
||||||
|
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: slackapi/slack-github-action@v1
|
||||||
|
with:
|
||||||
|
channel-id: C060CNA47S9 # on-call-staging-storage-stream
|
||||||
|
slack-message: |
|
||||||
|
Benchmarks failed on main: ${{ github.event.head_commit.url }}
|
||||||
|
|
||||||
|
Allure report: ${{ needs.create-test-report.outputs.report-url }}
|
||||||
|
env:
|
||||||
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
create-test-report:
|
create-test-report:
|
||||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
|
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
|
||||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||||
|
outputs:
|
||||||
|
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
||||||
|
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
container:
|
container:
|
||||||
@@ -718,9 +723,13 @@ jobs:
|
|||||||
uses: ./.github/workflows/trigger-e2e-tests.yml
|
uses: ./.github/workflows/trigger-e2e-tests.yml
|
||||||
secrets: inherit
|
secrets: inherit
|
||||||
|
|
||||||
neon-image:
|
neon-image-arch:
|
||||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
strategy:
|
||||||
|
matrix:
|
||||||
|
arch: [ x64, arm64 ]
|
||||||
|
|
||||||
|
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -742,12 +751,6 @@ jobs:
|
|||||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||||
|
|
||||||
- uses: docker/login-action@v3
|
|
||||||
with:
|
|
||||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
|
||||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
|
||||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
|
||||||
|
|
||||||
- uses: docker/build-push-action@v5
|
- uses: docker/build-push-action@v5
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
@@ -759,25 +762,52 @@ jobs:
|
|||||||
push: true
|
push: true
|
||||||
pull: true
|
pull: true
|
||||||
file: Dockerfile
|
file: Dockerfile
|
||||||
cache-from: type=registry,ref=neondatabase/neon:cache
|
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
|
||||||
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
|
cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
|
||||||
tags: |
|
tags: |
|
||||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||||
neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
|
||||||
|
|
||||||
- name: Remove custom docker config directory
|
- name: Remove custom docker config directory
|
||||||
if: always()
|
if: always()
|
||||||
run: |
|
run: |
|
||||||
rm -rf .docker-custom
|
rm -rf .docker-custom
|
||||||
|
|
||||||
compute-node-image:
|
neon-image:
|
||||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
needs: [ neon-image-arch, tag ]
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||||
|
|
||||||
|
- name: Create multi-arch image
|
||||||
|
run: |
|
||||||
|
docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
|
||||||
|
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||||
|
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
|
||||||
|
|
||||||
|
- uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||||
|
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||||
|
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||||
|
|
||||||
|
- name: Push multi-arch image to ECR
|
||||||
|
run: |
|
||||||
|
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
|
||||||
|
neondatabase/neon:${{ needs.tag.outputs.build-tag }}
|
||||||
|
|
||||||
|
compute-node-image-arch:
|
||||||
|
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
version: [ v14, v15, v16 ]
|
version: [ v14, v15, v16 ]
|
||||||
|
arch: [ x64, arm64 ]
|
||||||
|
|
||||||
|
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -824,15 +854,14 @@ jobs:
|
|||||||
push: true
|
push: true
|
||||||
pull: true
|
pull: true
|
||||||
file: Dockerfile.compute-node
|
file: Dockerfile.compute-node
|
||||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
|
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
|
||||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
|
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
|
||||||
tags: |
|
tags: |
|
||||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||||
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
|
||||||
|
|
||||||
- name: Build compute-tools image
|
- name: Build compute-tools image
|
||||||
# compute-tools are Postgres independent, so build it only once
|
# compute-tools are Postgres independent, so build it only once
|
||||||
if: ${{ matrix.version == 'v16' }}
|
if: matrix.version == 'v16'
|
||||||
uses: docker/build-push-action@v5
|
uses: docker/build-push-action@v5
|
||||||
with:
|
with:
|
||||||
target: compute-tools-image
|
target: compute-tools-image
|
||||||
@@ -846,14 +875,57 @@ jobs:
|
|||||||
pull: true
|
pull: true
|
||||||
file: Dockerfile.compute-node
|
file: Dockerfile.compute-node
|
||||||
tags: |
|
tags: |
|
||||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
|
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
|
||||||
|
|
||||||
- name: Remove custom docker config directory
|
- name: Remove custom docker config directory
|
||||||
if: always()
|
if: always()
|
||||||
run: |
|
run: |
|
||||||
rm -rf .docker-custom
|
rm -rf .docker-custom
|
||||||
|
|
||||||
|
compute-node-image:
|
||||||
|
needs: [ compute-node-image-arch, tag ]
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
version: [ v14, v15, v16 ]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||||
|
|
||||||
|
- name: Create multi-arch compute-node image
|
||||||
|
run: |
|
||||||
|
docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||||
|
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||||
|
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
|
||||||
|
|
||||||
|
- name: Create multi-arch compute-tools image
|
||||||
|
if: matrix.version == 'v16'
|
||||||
|
run: |
|
||||||
|
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
|
||||||
|
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
|
||||||
|
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
|
||||||
|
|
||||||
|
- uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||||
|
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||||
|
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||||
|
|
||||||
|
- name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
|
||||||
|
run: |
|
||||||
|
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||||
|
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||||
|
|
||||||
|
- name: Push multi-arch compute-tools image to ECR
|
||||||
|
if: matrix.version == 'v16'
|
||||||
|
run: |
|
||||||
|
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
|
||||||
|
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||||
|
|
||||||
vm-compute-node-image:
|
vm-compute-node-image:
|
||||||
needs: [ check-permissions, tag, compute-node-image ]
|
needs: [ check-permissions, tag, compute-node-image ]
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, gen3, large ]
|
||||||
@@ -861,11 +933,8 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
version: [ v14, v15, v16 ]
|
version: [ v14, v15, v16 ]
|
||||||
defaults:
|
|
||||||
run:
|
|
||||||
shell: sh -eu {0}
|
|
||||||
env:
|
env:
|
||||||
VM_BUILDER_VERSION: v0.28.1
|
VM_BUILDER_VERSION: v0.29.3
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -878,26 +947,48 @@ jobs:
|
|||||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
||||||
chmod +x vm-builder
|
chmod +x vm-builder
|
||||||
|
|
||||||
|
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||||
|
# The default value is ~/.docker
|
||||||
|
- name: Set custom docker config directory
|
||||||
|
run: |
|
||||||
|
mkdir -p .docker-custom
|
||||||
|
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||||
|
|
||||||
# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
|
# Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
|
||||||
# it won't have the proper authentication (written at v0.6.0)
|
# it won't have the proper authentication (written at v0.6.0)
|
||||||
- name: Pulling compute-node image
|
- name: Pulling compute-node image
|
||||||
run: |
|
run: |
|
||||||
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||||
|
|
||||||
- name: Build vm image
|
- name: Build vm image
|
||||||
run: |
|
run: |
|
||||||
./vm-builder \
|
./vm-builder \
|
||||||
-spec=vm-image-spec.yaml \
|
-spec=vm-image-spec.yaml \
|
||||||
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
|
-src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
|
||||||
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
-dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||||
|
|
||||||
- name: Pushing vm-compute-node image
|
- name: Pushing vm-compute-node image
|
||||||
run: |
|
run: |
|
||||||
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||||
|
|
||||||
|
- name: Remove custom docker config directory
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
rm -rf .docker-custom
|
||||||
|
|
||||||
test-images:
|
test-images:
|
||||||
needs: [ check-permissions, tag, neon-image, compute-node-image ]
|
needs: [ check-permissions, tag, neon-image, compute-node-image ]
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
arch: [ x64, arm64 ]
|
||||||
|
|
||||||
|
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -915,7 +1006,7 @@ jobs:
|
|||||||
- name: Verify image versions
|
- name: Verify image versions
|
||||||
shell: bash # ensure no set -e for better error messages
|
shell: bash # ensure no set -e for better error messages
|
||||||
run: |
|
run: |
|
||||||
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
|
||||||
|
|
||||||
echo "Pageserver version string: $pageserver_version"
|
echo "Pageserver version string: $pageserver_version"
|
||||||
|
|
||||||
@@ -941,78 +1032,48 @@ jobs:
|
|||||||
|
|
||||||
promote-images:
|
promote-images:
|
||||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||||
runs-on: [ self-hosted, gen3, small ]
|
runs-on: ubuntu-latest
|
||||||
container: golang:1.19-bullseye
|
|
||||||
# Don't add if-condition here.
|
env:
|
||||||
# The job should always be run because we have dependant other jobs that shouldn't be skipped
|
VERSIONS: v14 v15 v16
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Install Crane & ECR helper
|
- uses: docker/login-action@v3
|
||||||
run: |
|
with:
|
||||||
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
|
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
|
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||||
|
|
||||||
- name: Configure ECR login
|
- uses: docker/login-action@v3
|
||||||
run: |
|
with:
|
||||||
mkdir /github/home/.docker/
|
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||||
|
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||||
|
|
||||||
- name: Copy vm-compute-node images to Docker Hub
|
- name: Copy vm-compute-node images to ECR
|
||||||
run: |
|
run: |
|
||||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
|
for version in ${VERSIONS}; do
|
||||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
|
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
|
||||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
|
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||||
|
done
|
||||||
|
|
||||||
- name: Add latest tag to images
|
- name: Add latest tag to images
|
||||||
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
|
if: github.ref_name == 'main'
|
||||||
run: |
|
run: |
|
||||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
|
for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
|
||||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
docker buildx imagetools create -t $repo/neon:latest \
|
||||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
$repo/neon:${{ needs.tag.outputs.build-tag }}
|
||||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
|
|
||||||
- name: Push images to production ECR
|
docker buildx imagetools create -t $repo/compute-tools:latest \
|
||||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||||
run: |
|
|
||||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
|
|
||||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
|
|
||||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
|
|
||||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
|
|
||||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
|
|
||||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
|
|
||||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
|
|
||||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
|
|
||||||
|
|
||||||
- name: Configure Docker Hub login
|
for version in ${VERSIONS}; do
|
||||||
run: |
|
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
|
||||||
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
|
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||||
echo "" > /github/home/.docker/config.json
|
|
||||||
crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
|
|
||||||
|
|
||||||
- name: Push vm-compute-node to Docker Hub
|
docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
|
||||||
run: |
|
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||||
crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
|
done
|
||||||
crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
|
done
|
||||||
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
|
|
||||||
|
|
||||||
- name: Push latest tags to Docker Hub
|
|
||||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
|
||||||
run: |
|
|
||||||
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
|
||||||
|
|
||||||
- name: Cleanup ECR folder
|
|
||||||
run: rm -rf ~/.ecr
|
|
||||||
|
|
||||||
trigger-custom-extensions-build-and-wait:
|
trigger-custom-extensions-build-and-wait:
|
||||||
needs: [ check-permissions, tag ]
|
needs: [ check-permissions, tag ]
|
||||||
|
|||||||
33
.github/workflows/neon_extra_builds.yml
vendored
33
.github/workflows/neon_extra_builds.yml
vendored
@@ -136,7 +136,7 @@ jobs:
|
|||||||
check-linux-arm-build:
|
check-linux-arm-build:
|
||||||
needs: [ check-permissions, build-build-tools-image ]
|
needs: [ check-permissions, build-build-tools-image ]
|
||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
runs-on: [ self-hosted, dev, arm64 ]
|
runs-on: [ self-hosted, small-arm64 ]
|
||||||
|
|
||||||
env:
|
env:
|
||||||
# Use release build only, to have less debug info around
|
# Use release build only, to have less debug info around
|
||||||
@@ -232,20 +232,20 @@ jobs:
|
|||||||
|
|
||||||
- name: Run cargo build
|
- name: Run cargo build
|
||||||
run: |
|
run: |
|
||||||
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
|
||||||
|
|
||||||
- name: Run cargo test
|
- name: Run cargo test
|
||||||
env:
|
env:
|
||||||
NEXTEST_RETRIES: 3
|
NEXTEST_RETRIES: 3
|
||||||
run: |
|
run: |
|
||||||
cargo nextest run $CARGO_FEATURES
|
cargo nextest run $CARGO_FEATURES -j$(nproc)
|
||||||
|
|
||||||
# Run separate tests for real S3
|
# Run separate tests for real S3
|
||||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||||
cargo nextest run --package remote_storage --test test_real_s3
|
cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
|
||||||
|
|
||||||
# Run separate tests for real Azure Blob Storage
|
# Run separate tests for real Azure Blob Storage
|
||||||
# XXX: replace region with `eu-central-1`-like region
|
# XXX: replace region with `eu-central-1`-like region
|
||||||
@@ -255,12 +255,12 @@ jobs:
|
|||||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||||
cargo nextest run --package remote_storage --test test_real_azure
|
cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
|
||||||
|
|
||||||
check-codestyle-rust-arm:
|
check-codestyle-rust-arm:
|
||||||
needs: [ check-permissions, build-build-tools-image ]
|
needs: [ check-permissions, build-build-tools-image ]
|
||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
runs-on: [ self-hosted, dev, arm64 ]
|
runs-on: [ self-hosted, small-arm64 ]
|
||||||
|
|
||||||
container:
|
container:
|
||||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||||
@@ -269,6 +269,11 @@ jobs:
|
|||||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||||
options: --init
|
options: --init
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
build_type: [ debug, release ]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Fix git ownership
|
- name: Fix git ownership
|
||||||
run: |
|
run: |
|
||||||
@@ -305,31 +310,35 @@ jobs:
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
||||||
|
|
||||||
- name: Run cargo clippy (debug)
|
- name: Run cargo clippy (debug)
|
||||||
|
if: matrix.build_type == 'debug'
|
||||||
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
||||||
- name: Run cargo clippy (release)
|
- name: Run cargo clippy (release)
|
||||||
|
if: matrix.build_type == 'release'
|
||||||
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
||||||
|
|
||||||
- name: Check documentation generation
|
- name: Check documentation generation
|
||||||
run: cargo doc --workspace --no-deps --document-private-items
|
if: matrix.build_type == 'release'
|
||||||
|
run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
|
||||||
env:
|
env:
|
||||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||||
|
|
||||||
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
||||||
- name: Check formatting
|
- name: Check formatting
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() && matrix.build_type == 'release' }}
|
||||||
run: cargo fmt --all -- --check
|
run: cargo fmt --all -- --check
|
||||||
|
|
||||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
||||||
- name: Check rust dependencies
|
- name: Check rust dependencies
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() && matrix.build_type == 'release' }}
|
||||||
run: |
|
run: |
|
||||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||||
|
|
||||||
# https://github.com/EmbarkStudios/cargo-deny
|
# https://github.com/EmbarkStudios/cargo-deny
|
||||||
- name: Check rust licenses/bans/advisories/sources
|
- name: Check rust licenses/bans/advisories/sources
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() && matrix.build_type == 'release' }}
|
||||||
run: cargo deny check
|
run: cargo deny check
|
||||||
|
|
||||||
gather-rust-build-stats:
|
gather-rust-build-stats:
|
||||||
@@ -338,7 +347,7 @@ jobs:
|
|||||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
|
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
|
||||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||||
github.ref_name == 'main'
|
github.ref_name == 'main'
|
||||||
runs-on: [ self-hosted, gen3, large ]
|
runs-on: [ self-hosted, large ]
|
||||||
container:
|
container:
|
||||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||||
credentials:
|
credentials:
|
||||||
@@ -369,7 +378,7 @@ jobs:
|
|||||||
run: make walproposer-lib -j$(nproc)
|
run: make walproposer-lib -j$(nproc)
|
||||||
|
|
||||||
- name: Produce the build stats
|
- name: Produce the build stats
|
||||||
run: cargo build --all --release --timings
|
run: cargo build --all --release --timings -j$(nproc)
|
||||||
|
|
||||||
- name: Upload the build stats
|
- name: Upload the build stats
|
||||||
id: upload-stats
|
id: upload-stats
|
||||||
|
|||||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
|||||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
cat << EOF > body.md
|
cat << EOF > body.md
|
||||||
## Release ${RELEASE_DATE}
|
## Storage & Compute release ${RELEASE_DATE}
|
||||||
|
|
||||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||||
EOF
|
EOF
|
||||||
|
|||||||
617
Cargo.lock
generated
617
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
60
Cargo.toml
60
Cargo.toml
@@ -41,25 +41,26 @@ license = "Apache-2.0"
|
|||||||
|
|
||||||
## All dependency versions, used in the project
|
## All dependency versions, used in the project
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
|
ahash = "0.8"
|
||||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||||
arc-swap = "1.6"
|
arc-swap = "1.6"
|
||||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||||
atomic-take = "1.1.0"
|
atomic-take = "1.1.0"
|
||||||
azure_core = "0.18"
|
azure_core = "0.19"
|
||||||
azure_identity = "0.18"
|
azure_identity = "0.19"
|
||||||
azure_storage = "0.18"
|
azure_storage = "0.19"
|
||||||
azure_storage_blobs = "0.18"
|
azure_storage_blobs = "0.19"
|
||||||
flate2 = "1.0.26"
|
flate2 = "1.0.26"
|
||||||
async-stream = "0.3"
|
async-stream = "0.3"
|
||||||
async-trait = "0.1"
|
async-trait = "0.1"
|
||||||
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
|
aws-config = { version = "1.3", default-features = false, features=["rustls"] }
|
||||||
aws-sdk-s3 = "1.14"
|
aws-sdk-s3 = "1.26"
|
||||||
aws-sdk-iam = "1.15.0"
|
aws-sdk-iam = "1.15.0"
|
||||||
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
|
aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
|
||||||
aws-smithy-types = "1.1.4"
|
aws-smithy-types = "1.1.9"
|
||||||
aws-credential-types = "1.1.4"
|
aws-credential-types = "1.2.0"
|
||||||
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
|
aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
|
||||||
aws-types = "1.1.7"
|
aws-types = "1.2.0"
|
||||||
axum = { version = "0.6.20", features = ["ws"] }
|
axum = { version = "0.6.20", features = ["ws"] }
|
||||||
base64 = "0.13.0"
|
base64 = "0.13.0"
|
||||||
bincode = "1.3"
|
bincode = "1.3"
|
||||||
@@ -74,6 +75,7 @@ clap = { version = "4.0", features = ["derive"] }
|
|||||||
comfy-table = "6.1"
|
comfy-table = "6.1"
|
||||||
const_format = "0.2"
|
const_format = "0.2"
|
||||||
crc32c = "0.6"
|
crc32c = "0.6"
|
||||||
|
crossbeam-deque = "0.8.5"
|
||||||
crossbeam-utils = "0.8.5"
|
crossbeam-utils = "0.8.5"
|
||||||
dashmap = { version = "5.5.0", features = ["raw-api"] }
|
dashmap = { version = "5.5.0", features = ["raw-api"] }
|
||||||
either = "1.8"
|
either = "1.8"
|
||||||
@@ -81,13 +83,14 @@ enum-map = "2.4.2"
|
|||||||
enumset = "1.0.12"
|
enumset = "1.0.12"
|
||||||
fail = "0.5.0"
|
fail = "0.5.0"
|
||||||
fallible-iterator = "0.2"
|
fallible-iterator = "0.2"
|
||||||
|
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
||||||
fs2 = "0.4.3"
|
fs2 = "0.4.3"
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
futures-core = "0.3"
|
futures-core = "0.3"
|
||||||
futures-util = "0.3"
|
futures-util = "0.3"
|
||||||
git-version = "0.3"
|
git-version = "0.3"
|
||||||
hashbrown = "0.13"
|
hashbrown = "0.14"
|
||||||
hashlink = "0.8.4"
|
hashlink = "0.9.1"
|
||||||
hdrhistogram = "7.5.2"
|
hdrhistogram = "7.5.2"
|
||||||
hex = "0.4"
|
hex = "0.4"
|
||||||
hex-literal = "0.4"
|
hex-literal = "0.4"
|
||||||
@@ -98,7 +101,8 @@ http-types = { version = "2", default-features = false }
|
|||||||
humantime = "2.1"
|
humantime = "2.1"
|
||||||
humantime-serde = "1.1.1"
|
humantime-serde = "1.1.1"
|
||||||
hyper = "0.14"
|
hyper = "0.14"
|
||||||
hyper-tungstenite = "0.13.0"
|
tokio-tungstenite = "0.20.0"
|
||||||
|
indexmap = "2"
|
||||||
inotify = "0.10.2"
|
inotify = "0.10.2"
|
||||||
ipnet = "2.9.0"
|
ipnet = "2.9.0"
|
||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
@@ -120,8 +124,8 @@ opentelemetry = "0.20.0"
|
|||||||
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||||
opentelemetry-semantic-conventions = "0.12.0"
|
opentelemetry-semantic-conventions = "0.12.0"
|
||||||
parking_lot = "0.12"
|
parking_lot = "0.12"
|
||||||
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
|
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
|
||||||
parquet_derive = "49.0.0"
|
parquet_derive = "51.0.0"
|
||||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||||
pin-project-lite = "0.2"
|
pin-project-lite = "0.2"
|
||||||
procfs = "0.14"
|
procfs = "0.14"
|
||||||
@@ -130,10 +134,10 @@ prost = "0.11"
|
|||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||||
regex = "1.10.2"
|
regex = "1.10.2"
|
||||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
|
||||||
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
|
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] }
|
||||||
reqwest-middleware = "0.2.0"
|
reqwest-middleware = "0.3.0"
|
||||||
reqwest-retry = "0.2.2"
|
reqwest-retry = "0.5"
|
||||||
routerify = "3"
|
routerify = "3"
|
||||||
rpds = "0.13"
|
rpds = "0.13"
|
||||||
rustc-hash = "1.1.0"
|
rustc-hash = "1.1.0"
|
||||||
@@ -143,7 +147,7 @@ rustls-split = "0.3"
|
|||||||
scopeguard = "1.1"
|
scopeguard = "1.1"
|
||||||
sysinfo = "0.29.2"
|
sysinfo = "0.29.2"
|
||||||
sd-notify = "0.4.1"
|
sd-notify = "0.4.1"
|
||||||
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
serde_path_to_error = "0.1"
|
serde_path_to_error = "0.1"
|
||||||
@@ -157,7 +161,8 @@ socket2 = "0.5"
|
|||||||
strum = "0.24"
|
strum = "0.24"
|
||||||
strum_macros = "0.24"
|
strum_macros = "0.24"
|
||||||
"subtle" = "2.5.0"
|
"subtle" = "2.5.0"
|
||||||
svg_fmt = "0.4.1"
|
# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
|
||||||
|
svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
|
||||||
sync_wrapper = "0.1.2"
|
sync_wrapper = "0.1.2"
|
||||||
tar = "0.4"
|
tar = "0.4"
|
||||||
task-local-extensions = "0.1.4"
|
task-local-extensions = "0.1.4"
|
||||||
@@ -176,10 +181,11 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
|||||||
toml = "0.7"
|
toml = "0.7"
|
||||||
toml_edit = "0.19"
|
toml_edit = "0.19"
|
||||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||||
|
tower-service = "0.3.2"
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
tracing-error = "0.2.0"
|
tracing-error = "0.2.0"
|
||||||
tracing-opentelemetry = "0.20.0"
|
tracing-opentelemetry = "0.21.0"
|
||||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
|
||||||
twox-hash = { version = "1.6.3", default-features = false }
|
twox-hash = { version = "1.6.3", default-features = false }
|
||||||
url = "2.2"
|
url = "2.2"
|
||||||
urlencoding = "2.1"
|
urlencoding = "2.1"
|
||||||
@@ -240,8 +246,8 @@ tonic-build = "0.9"
|
|||||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||||
|
|
||||||
# bug fixes for UUID
|
# bug fixes for UUID
|
||||||
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||||
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||||
|
|
||||||
################# Binary contents sections
|
################# Binary contents sections
|
||||||
|
|
||||||
@@ -252,7 +258,7 @@ debug = true
|
|||||||
|
|
||||||
# disable debug symbols for all packages except this one to decrease binaries size
|
# disable debug symbols for all packages except this one to decrease binaries size
|
||||||
[profile.release.package."*"]
|
[profile.release.package."*"]
|
||||||
debug = true
|
debug = false
|
||||||
|
|
||||||
[profile.release-line-debug]
|
[profile.release-line-debug]
|
||||||
inherits = "release"
|
inherits = "release"
|
||||||
|
|||||||
@@ -44,7 +44,6 @@ COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_i
|
|||||||
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
||||||
COPY --chown=nonroot . .
|
COPY --chown=nonroot . .
|
||||||
|
|
||||||
ENV _RJEM_MALLOC_CONF="prof:true"
|
|
||||||
# Show build caching stats to check if it was used in the end.
|
# Show build caching stats to check if it was used in the end.
|
||||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||||
RUN set -e \
|
RUN set -e \
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
|
|||||||
&& mv s5cmd /usr/local/bin/s5cmd
|
&& mv s5cmd /usr/local/bin/s5cmd
|
||||||
|
|
||||||
# LLVM
|
# LLVM
|
||||||
ENV LLVM_VERSION=17
|
ENV LLVM_VERSION=18
|
||||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||||
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||||
&& apt update \
|
&& apt update \
|
||||||
@@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
|
|||||||
&& rm awscliv2.zip
|
&& rm awscliv2.zip
|
||||||
|
|
||||||
# Mold: A Modern Linker
|
# Mold: A Modern Linker
|
||||||
ENV MOLD_VERSION v2.4.0
|
ENV MOLD_VERSION v2.31.0
|
||||||
RUN set -e \
|
RUN set -e \
|
||||||
&& git clone https://github.com/rui314/mold.git \
|
&& git clone https://github.com/rui314/mold.git \
|
||||||
&& mkdir mold/build \
|
&& mkdir mold/build \
|
||||||
@@ -141,7 +141,7 @@ WORKDIR /home/nonroot
|
|||||||
|
|
||||||
# Rust
|
# Rust
|
||||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||||
ENV RUSTC_VERSION=1.77.0
|
ENV RUSTC_VERSION=1.78.0
|
||||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||||
|
|||||||
@@ -241,11 +241,17 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
|
|||||||
FROM build-deps AS vector-pg-build
|
FROM build-deps AS vector-pg-build
|
||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
|
|
||||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
|
COPY patches/pgvector.patch /pgvector.patch
|
||||||
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
|
|
||||||
|
# By default, pgvector Makefile uses `-march=native`. We don't want that,
|
||||||
|
# because we build the images on different machines than where we run them.
|
||||||
|
# Pass OPTFLAGS="" to remove it.
|
||||||
|
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
|
||||||
|
echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
|
||||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
patch -p1 < /pgvector.patch && \
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||||
|
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||||
|
|
||||||
#########################################################################################
|
#########################################################################################
|
||||||
|
|||||||
29
Makefile
29
Makefile
@@ -25,14 +25,16 @@ ifeq ($(UNAME_S),Linux)
|
|||||||
# Seccomp BPF is only available for Linux
|
# Seccomp BPF is only available for Linux
|
||||||
PG_CONFIGURE_OPTS += --with-libseccomp
|
PG_CONFIGURE_OPTS += --with-libseccomp
|
||||||
else ifeq ($(UNAME_S),Darwin)
|
else ifeq ($(UNAME_S),Darwin)
|
||||||
# macOS with brew-installed openssl requires explicit paths
|
ifndef DISABLE_HOMEBREW
|
||||||
# It can be configured with OPENSSL_PREFIX variable
|
# macOS with brew-installed openssl requires explicit paths
|
||||||
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
|
# It can be configured with OPENSSL_PREFIX variable
|
||||||
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
|
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
|
||||||
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
|
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
|
||||||
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
|
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
|
||||||
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
|
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
|
||||||
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
|
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
|
||||||
|
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# Use -C option so that when PostgreSQL "make install" installs the
|
# Use -C option so that when PostgreSQL "make install" installs the
|
||||||
@@ -79,11 +81,14 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
|
|||||||
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
|
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
|
||||||
exit 1; }
|
exit 1; }
|
||||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
|
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
|
||||||
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
|
|
||||||
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
|
VERSION=$*; \
|
||||||
|
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
|
||||||
|
(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
|
||||||
|
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
|
||||||
CFLAGS='$(PG_CFLAGS)' \
|
CFLAGS='$(PG_CFLAGS)' \
|
||||||
$(PG_CONFIGURE_OPTS) \
|
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
|
||||||
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)
|
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
|
||||||
|
|
||||||
# nicer alias to run 'configure'
|
# nicer alias to run 'configure'
|
||||||
# Note: I've been unable to use templates for this part of our configuration.
|
# Note: I've been unable to use templates for this part of our configuration.
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
[](https://neon.tech)
|
[](https://neon.tech)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Neon
|
# Neon
|
||||||
|
|
||||||
|
|||||||
@@ -27,10 +27,12 @@ reqwest = { workspace = true, features = ["json"] }
|
|||||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||||
tokio-postgres.workspace = true
|
tokio-postgres.workspace = true
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
|
tokio-stream.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
tracing-opentelemetry.workspace = true
|
tracing-opentelemetry.workspace = true
|
||||||
tracing-subscriber.workspace = true
|
tracing-subscriber.workspace = true
|
||||||
tracing-utils.workspace = true
|
tracing-utils.workspace = true
|
||||||
|
thiserror.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
|
|
||||||
compute_api.workspace = true
|
compute_api.workspace = true
|
||||||
|
|||||||
@@ -47,10 +47,11 @@ use chrono::Utc;
|
|||||||
use clap::Arg;
|
use clap::Arg;
|
||||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||||
use tracing::{error, info};
|
use tracing::{error, info, warn};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
use compute_api::responses::ComputeStatus;
|
use compute_api::responses::ComputeStatus;
|
||||||
|
use compute_api::spec::ComputeSpec;
|
||||||
|
|
||||||
use compute_tools::compute::{
|
use compute_tools::compute::{
|
||||||
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
|
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
|
||||||
@@ -62,12 +63,41 @@ use compute_tools::logger::*;
|
|||||||
use compute_tools::monitor::launch_monitor;
|
use compute_tools::monitor::launch_monitor;
|
||||||
use compute_tools::params::*;
|
use compute_tools::params::*;
|
||||||
use compute_tools::spec::*;
|
use compute_tools::spec::*;
|
||||||
|
use compute_tools::swap::resize_swap;
|
||||||
|
|
||||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||||
// in-case of not-set environment var
|
// in-case of not-set environment var
|
||||||
const BUILD_TAG_DEFAULT: &str = "latest";
|
const BUILD_TAG_DEFAULT: &str = "latest";
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
|
let (build_tag, clap_args) = init()?;
|
||||||
|
|
||||||
|
let (pg_handle, start_pg_result) = {
|
||||||
|
// Enter startup tracing context
|
||||||
|
let _startup_context_guard = startup_context_from_env();
|
||||||
|
|
||||||
|
let cli_args = process_cli(&clap_args)?;
|
||||||
|
|
||||||
|
let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
|
||||||
|
|
||||||
|
let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
|
||||||
|
|
||||||
|
start_postgres(&clap_args, wait_spec_result)?
|
||||||
|
|
||||||
|
// Startup is finished, exit the startup tracing span
|
||||||
|
};
|
||||||
|
|
||||||
|
// PostgreSQL is now running, if startup was successful. Wait until it exits.
|
||||||
|
let wait_pg_result = wait_postgres(pg_handle)?;
|
||||||
|
|
||||||
|
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
|
||||||
|
|
||||||
|
maybe_delay_exit(delay_exit);
|
||||||
|
|
||||||
|
deinit_and_exit(wait_pg_result);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn init() -> Result<(String, clap::ArgMatches)> {
|
||||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||||
|
|
||||||
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
||||||
@@ -82,9 +112,15 @@ fn main() -> Result<()> {
|
|||||||
.to_string();
|
.to_string();
|
||||||
info!("build_tag: {build_tag}");
|
info!("build_tag: {build_tag}");
|
||||||
|
|
||||||
let matches = cli().get_matches();
|
Ok((build_tag, cli().get_matches()))
|
||||||
let pgbin_default = String::from("postgres");
|
}
|
||||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
|
||||||
|
fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
|
||||||
|
let pgbin_default = "postgres";
|
||||||
|
let pgbin = matches
|
||||||
|
.get_one::<String>("pgbin")
|
||||||
|
.map(|s| s.as_str())
|
||||||
|
.unwrap_or(pgbin_default);
|
||||||
|
|
||||||
let ext_remote_storage = matches
|
let ext_remote_storage = matches
|
||||||
.get_one::<String>("remote-ext-config")
|
.get_one::<String>("remote-ext-config")
|
||||||
@@ -110,7 +146,32 @@ fn main() -> Result<()> {
|
|||||||
.expect("Postgres connection string is required");
|
.expect("Postgres connection string is required");
|
||||||
let spec_json = matches.get_one::<String>("spec");
|
let spec_json = matches.get_one::<String>("spec");
|
||||||
let spec_path = matches.get_one::<String>("spec-path");
|
let spec_path = matches.get_one::<String>("spec-path");
|
||||||
|
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
|
||||||
|
|
||||||
|
Ok(ProcessCliResult {
|
||||||
|
connstr,
|
||||||
|
pgdata,
|
||||||
|
pgbin,
|
||||||
|
ext_remote_storage,
|
||||||
|
http_port,
|
||||||
|
spec_json,
|
||||||
|
spec_path,
|
||||||
|
resize_swap_on_bind,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ProcessCliResult<'clap> {
|
||||||
|
connstr: &'clap str,
|
||||||
|
pgdata: &'clap str,
|
||||||
|
pgbin: &'clap str,
|
||||||
|
ext_remote_storage: Option<&'clap str>,
|
||||||
|
http_port: u16,
|
||||||
|
spec_json: Option<&'clap String>,
|
||||||
|
spec_path: Option<&'clap String>,
|
||||||
|
resize_swap_on_bind: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||||
// Extract OpenTelemetry context for the startup actions from the
|
// Extract OpenTelemetry context for the startup actions from the
|
||||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||||
// tracing context.
|
// tracing context.
|
||||||
@@ -147,7 +208,7 @@ fn main() -> Result<()> {
|
|||||||
if let Ok(val) = std::env::var("TRACESTATE") {
|
if let Ok(val) = std::env::var("TRACESTATE") {
|
||||||
startup_tracing_carrier.insert("tracestate".to_string(), val);
|
startup_tracing_carrier.insert("tracestate".to_string(), val);
|
||||||
}
|
}
|
||||||
let startup_context_guard = if !startup_tracing_carrier.is_empty() {
|
if !startup_tracing_carrier.is_empty() {
|
||||||
use opentelemetry::propagation::TextMapPropagator;
|
use opentelemetry::propagation::TextMapPropagator;
|
||||||
use opentelemetry::sdk::propagation::TraceContextPropagator;
|
use opentelemetry::sdk::propagation::TraceContextPropagator;
|
||||||
let guard = TraceContextPropagator::new()
|
let guard = TraceContextPropagator::new()
|
||||||
@@ -157,8 +218,17 @@ fn main() -> Result<()> {
|
|||||||
Some(guard)
|
Some(guard)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn try_spec_from_cli(
|
||||||
|
matches: &clap::ArgMatches,
|
||||||
|
ProcessCliResult {
|
||||||
|
spec_json,
|
||||||
|
spec_path,
|
||||||
|
..
|
||||||
|
}: &ProcessCliResult,
|
||||||
|
) -> Result<CliSpecParams> {
|
||||||
let compute_id = matches.get_one::<String>("compute-id");
|
let compute_id = matches.get_one::<String>("compute-id");
|
||||||
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
||||||
|
|
||||||
@@ -199,6 +269,34 @@ fn main() -> Result<()> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Ok(CliSpecParams {
|
||||||
|
spec,
|
||||||
|
live_config_allowed,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
struct CliSpecParams {
|
||||||
|
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
|
||||||
|
spec: Option<ComputeSpec>,
|
||||||
|
live_config_allowed: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn wait_spec(
|
||||||
|
build_tag: String,
|
||||||
|
ProcessCliResult {
|
||||||
|
connstr,
|
||||||
|
pgdata,
|
||||||
|
pgbin,
|
||||||
|
ext_remote_storage,
|
||||||
|
resize_swap_on_bind,
|
||||||
|
http_port,
|
||||||
|
..
|
||||||
|
}: ProcessCliResult,
|
||||||
|
CliSpecParams {
|
||||||
|
spec,
|
||||||
|
live_config_allowed,
|
||||||
|
}: CliSpecParams,
|
||||||
|
) -> Result<WaitSpecResult> {
|
||||||
let mut new_state = ComputeState::new();
|
let mut new_state = ComputeState::new();
|
||||||
let spec_set;
|
let spec_set;
|
||||||
|
|
||||||
@@ -226,19 +324,17 @@ fn main() -> Result<()> {
|
|||||||
|
|
||||||
// If this is a pooled VM, prewarm before starting HTTP server and becoming
|
// If this is a pooled VM, prewarm before starting HTTP server and becoming
|
||||||
// available for binding. Prewarming helps Postgres start quicker later,
|
// available for binding. Prewarming helps Postgres start quicker later,
|
||||||
// because QEMU will already have it's memory allocated from the host, and
|
// because QEMU will already have its memory allocated from the host, and
|
||||||
// the necessary binaries will already be cached.
|
// the necessary binaries will already be cached.
|
||||||
if !spec_set {
|
if !spec_set {
|
||||||
compute.prewarm_postgres()?;
|
compute.prewarm_postgres()?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Launch http service first, so we were able to serve control-plane
|
// Launch http service first, so that we can serve control-plane requests
|
||||||
// requests, while configuration is still in progress.
|
// while configuration is still in progress.
|
||||||
let _http_handle =
|
let _http_handle =
|
||||||
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
|
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
|
||||||
|
|
||||||
let extension_server_port: u16 = http_port;
|
|
||||||
|
|
||||||
if !spec_set {
|
if !spec_set {
|
||||||
// No spec provided, hang waiting for it.
|
// No spec provided, hang waiting for it.
|
||||||
info!("no compute spec provided, waiting");
|
info!("no compute spec provided, waiting");
|
||||||
@@ -253,21 +349,45 @@ fn main() -> Result<()> {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Record for how long we slept waiting for the spec.
|
||||||
|
let now = Utc::now();
|
||||||
|
state.metrics.wait_for_spec_ms = now
|
||||||
|
.signed_duration_since(state.start_time)
|
||||||
|
.to_std()
|
||||||
|
.unwrap()
|
||||||
|
.as_millis() as u64;
|
||||||
|
|
||||||
|
// Reset start time, so that the total startup time that is calculated later will
|
||||||
|
// not include the time that we waited for the spec.
|
||||||
|
state.start_time = now;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(WaitSpecResult {
|
||||||
|
compute,
|
||||||
|
http_port,
|
||||||
|
resize_swap_on_bind,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
struct WaitSpecResult {
|
||||||
|
compute: Arc<ComputeNode>,
|
||||||
|
// passed through from ProcessCliResult
|
||||||
|
http_port: u16,
|
||||||
|
resize_swap_on_bind: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn start_postgres(
|
||||||
|
// need to allow unused because `matches` is only used if target_os = "linux"
|
||||||
|
#[allow(unused_variables)] matches: &clap::ArgMatches,
|
||||||
|
WaitSpecResult {
|
||||||
|
compute,
|
||||||
|
http_port,
|
||||||
|
resize_swap_on_bind,
|
||||||
|
}: WaitSpecResult,
|
||||||
|
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
|
||||||
// We got all we need, update the state.
|
// We got all we need, update the state.
|
||||||
let mut state = compute.state.lock().unwrap();
|
let mut state = compute.state.lock().unwrap();
|
||||||
|
|
||||||
// Record for how long we slept waiting for the spec.
|
|
||||||
state.metrics.wait_for_spec_ms = Utc::now()
|
|
||||||
.signed_duration_since(state.start_time)
|
|
||||||
.to_std()
|
|
||||||
.unwrap()
|
|
||||||
.as_millis() as u64;
|
|
||||||
// Reset start time to the actual start of the configuration, so that
|
|
||||||
// total startup time was properly measured at the end.
|
|
||||||
state.start_time = Utc::now();
|
|
||||||
|
|
||||||
state.status = ComputeStatus::Init;
|
state.status = ComputeStatus::Init;
|
||||||
compute.state_changed.notify_all();
|
compute.state_changed.notify_all();
|
||||||
|
|
||||||
@@ -275,33 +395,72 @@ fn main() -> Result<()> {
|
|||||||
"running compute with features: {:?}",
|
"running compute with features: {:?}",
|
||||||
state.pspec.as_ref().unwrap().spec.features
|
state.pspec.as_ref().unwrap().spec.features
|
||||||
);
|
);
|
||||||
|
// before we release the mutex, fetch the swap size (if any) for later.
|
||||||
|
let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
|
||||||
drop(state);
|
drop(state);
|
||||||
|
|
||||||
// Launch remaining service threads
|
// Launch remaining service threads
|
||||||
let _monitor_handle = launch_monitor(&compute);
|
let _monitor_handle = launch_monitor(&compute);
|
||||||
let _configurator_handle = launch_configurator(&compute);
|
let _configurator_handle = launch_configurator(&compute);
|
||||||
|
|
||||||
// Start Postgres
|
let mut prestartup_failed = false;
|
||||||
let mut delay_exit = false;
|
let mut delay_exit = false;
|
||||||
let mut exit_code = None;
|
|
||||||
let pg = match compute.start_compute(extension_server_port) {
|
// Resize swap to the desired size if the compute spec says so
|
||||||
Ok(pg) => Some(pg),
|
if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
|
||||||
Err(err) => {
|
// To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
|
||||||
error!("could not start the compute node: {:#}", err);
|
// *before* starting postgres.
|
||||||
let mut state = compute.state.lock().unwrap();
|
//
|
||||||
state.error = Some(format!("{:?}", err));
|
// In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
|
||||||
state.status = ComputeStatus::Failed;
|
// carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
|
||||||
// Notify others that Postgres failed to start. In case of configuring the
|
// OOM-killed during startup because swap wasn't available yet.
|
||||||
// empty compute, it's likely that API handler is still waiting for compute
|
match resize_swap(size_bytes) {
|
||||||
// state change. With this we will notify it that compute is in Failed state,
|
Ok(()) => {
|
||||||
// so control plane will know about it earlier and record proper error instead
|
let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
|
||||||
// of timeout.
|
info!(%size_bytes, %size_gib, "resized swap");
|
||||||
compute.state_changed.notify_all();
|
}
|
||||||
drop(state); // unlock
|
Err(err) => {
|
||||||
delay_exit = true;
|
let err = err.context("failed to resize swap");
|
||||||
None
|
error!("{err:#}");
|
||||||
|
|
||||||
|
// Mark compute startup as failed; don't try to start postgres, and report this
|
||||||
|
// error to the control plane when it next asks.
|
||||||
|
prestartup_failed = true;
|
||||||
|
let mut state = compute.state.lock().unwrap();
|
||||||
|
state.error = Some(format!("{err:?}"));
|
||||||
|
state.status = ComputeStatus::Failed;
|
||||||
|
compute.state_changed.notify_all();
|
||||||
|
delay_exit = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
|
||||||
|
let extension_server_port: u16 = http_port;
|
||||||
|
|
||||||
|
// Start Postgres
|
||||||
|
let mut pg = None;
|
||||||
|
if !prestartup_failed {
|
||||||
|
pg = match compute.start_compute(extension_server_port) {
|
||||||
|
Ok(pg) => Some(pg),
|
||||||
|
Err(err) => {
|
||||||
|
error!("could not start the compute node: {:#}", err);
|
||||||
|
let mut state = compute.state.lock().unwrap();
|
||||||
|
state.error = Some(format!("{:?}", err));
|
||||||
|
state.status = ComputeStatus::Failed;
|
||||||
|
// Notify others that Postgres failed to start. In case of configuring the
|
||||||
|
// empty compute, it's likely that API handler is still waiting for compute
|
||||||
|
// state change. With this we will notify it that compute is in Failed state,
|
||||||
|
// so control plane will know about it earlier and record proper error instead
|
||||||
|
// of timeout.
|
||||||
|
compute.state_changed.notify_all();
|
||||||
|
drop(state); // unlock
|
||||||
|
delay_exit = true;
|
||||||
|
None
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
warn!("skipping postgres startup because pre-startup step failed");
|
||||||
|
}
|
||||||
|
|
||||||
// Start the vm-monitor if directed to. The vm-monitor only runs on linux
|
// Start the vm-monitor if directed to. The vm-monitor only runs on linux
|
||||||
// because it requires cgroups.
|
// because it requires cgroups.
|
||||||
@@ -334,7 +493,7 @@ fn main() -> Result<()> {
|
|||||||
// This token is used internally by the monitor to clean up all threads
|
// This token is used internally by the monitor to clean up all threads
|
||||||
let token = CancellationToken::new();
|
let token = CancellationToken::new();
|
||||||
|
|
||||||
let vm_monitor = &rt.as_ref().map(|rt| {
|
let vm_monitor = rt.as_ref().map(|rt| {
|
||||||
rt.spawn(vm_monitor::start(
|
rt.spawn(vm_monitor::start(
|
||||||
Box::leak(Box::new(vm_monitor::Args {
|
Box::leak(Box::new(vm_monitor::Args {
|
||||||
cgroup: cgroup.cloned(),
|
cgroup: cgroup.cloned(),
|
||||||
@@ -347,12 +506,41 @@ fn main() -> Result<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok((
|
||||||
|
pg,
|
||||||
|
StartPostgresResult {
|
||||||
|
delay_exit,
|
||||||
|
compute,
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
rt,
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
token,
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
vm_monitor,
|
||||||
|
},
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
|
||||||
|
|
||||||
|
struct StartPostgresResult {
|
||||||
|
delay_exit: bool,
|
||||||
|
// passed through from WaitSpecResult
|
||||||
|
compute: Arc<ComputeNode>,
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
rt: Option<tokio::runtime::Runtime>,
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
token: tokio_util::sync::CancellationToken,
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
|
||||||
// Wait for the child Postgres process forever. In this state Ctrl+C will
|
// Wait for the child Postgres process forever. In this state Ctrl+C will
|
||||||
// propagate to Postgres and it will be shut down as well.
|
// propagate to Postgres and it will be shut down as well.
|
||||||
|
let mut exit_code = None;
|
||||||
if let Some((mut pg, logs_handle)) = pg {
|
if let Some((mut pg, logs_handle)) = pg {
|
||||||
// Startup is finished, exit the startup tracing span
|
|
||||||
drop(startup_context_guard);
|
|
||||||
|
|
||||||
let ecode = pg
|
let ecode = pg
|
||||||
.wait()
|
.wait()
|
||||||
.expect("failed to start waiting on Postgres process");
|
.expect("failed to start waiting on Postgres process");
|
||||||
@@ -367,6 +555,25 @@ fn main() -> Result<()> {
|
|||||||
exit_code = ecode.code()
|
exit_code = ecode.code()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(WaitPostgresResult { exit_code })
|
||||||
|
}
|
||||||
|
|
||||||
|
struct WaitPostgresResult {
|
||||||
|
exit_code: Option<i32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn cleanup_after_postgres_exit(
|
||||||
|
StartPostgresResult {
|
||||||
|
mut delay_exit,
|
||||||
|
compute,
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
vm_monitor,
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
token,
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
rt,
|
||||||
|
}: StartPostgresResult,
|
||||||
|
) -> Result<bool> {
|
||||||
// Terminate the vm_monitor so it releases the file watcher on
|
// Terminate the vm_monitor so it releases the file watcher on
|
||||||
// /sys/fs/cgroup/neon-postgres.
|
// /sys/fs/cgroup/neon-postgres.
|
||||||
// Note: the vm-monitor only runs on linux because it requires cgroups.
|
// Note: the vm-monitor only runs on linux because it requires cgroups.
|
||||||
@@ -408,13 +615,19 @@ fn main() -> Result<()> {
|
|||||||
error!("error while checking for core dumps: {err:?}");
|
error!("error while checking for core dumps: {err:?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(delay_exit)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn maybe_delay_exit(delay_exit: bool) {
|
||||||
// If launch failed, keep serving HTTP requests for a while, so the cloud
|
// If launch failed, keep serving HTTP requests for a while, so the cloud
|
||||||
// control plane can get the actual error.
|
// control plane can get the actual error.
|
||||||
if delay_exit {
|
if delay_exit {
|
||||||
info!("giving control plane 30s to collect the error before shutdown");
|
info!("giving control plane 30s to collect the error before shutdown");
|
||||||
thread::sleep(Duration::from_secs(30));
|
thread::sleep(Duration::from_secs(30));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
|
||||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||||
// pending traces before we exit. Shutting down OTEL tracing provider may
|
// pending traces before we exit. Shutting down OTEL tracing provider may
|
||||||
// hang for quite some time, see, for example:
|
// hang for quite some time, see, for example:
|
||||||
@@ -526,6 +739,11 @@ fn cli() -> clap::Command {
|
|||||||
)
|
)
|
||||||
.value_name("FILECACHE_CONNSTR"),
|
.value_name("FILECACHE_CONNSTR"),
|
||||||
)
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::new("resize-swap-on-bind")
|
||||||
|
.long("resize-swap-on-bind")
|
||||||
|
.action(clap::ArgAction::SetTrue),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
||||||
|
|||||||
116
compute_tools/src/catalog.rs
Normal file
116
compute_tools/src/catalog.rs
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
use compute_api::{
|
||||||
|
responses::CatalogObjects,
|
||||||
|
spec::{Database, Role},
|
||||||
|
};
|
||||||
|
use futures::Stream;
|
||||||
|
use postgres::{Client, NoTls};
|
||||||
|
use std::{path::Path, process::Stdio, result::Result, sync::Arc};
|
||||||
|
use tokio::{
|
||||||
|
io::{AsyncBufReadExt, BufReader},
|
||||||
|
process::Command,
|
||||||
|
task,
|
||||||
|
};
|
||||||
|
use tokio_stream::{self as stream, StreamExt};
|
||||||
|
use tokio_util::codec::{BytesCodec, FramedRead};
|
||||||
|
use tracing::warn;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
compute::ComputeNode,
|
||||||
|
pg_helpers::{get_existing_dbs, get_existing_roles},
|
||||||
|
};
|
||||||
|
|
||||||
|
pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
|
||||||
|
let connstr = compute.connstr.clone();
|
||||||
|
task::spawn_blocking(move || {
|
||||||
|
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||||
|
let roles: Vec<Role>;
|
||||||
|
{
|
||||||
|
let mut xact = client.transaction()?;
|
||||||
|
roles = get_existing_roles(&mut xact)?;
|
||||||
|
}
|
||||||
|
let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
|
||||||
|
|
||||||
|
Ok(CatalogObjects { roles, databases })
|
||||||
|
})
|
||||||
|
.await?
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, thiserror::Error)]
|
||||||
|
pub enum SchemaDumpError {
|
||||||
|
#[error("Database does not exist.")]
|
||||||
|
DatabaseDoesNotExist,
|
||||||
|
#[error("Failed to execute pg_dump.")]
|
||||||
|
IO(#[from] std::io::Error),
|
||||||
|
}
|
||||||
|
|
||||||
|
// It uses the pg_dump utility to dump the schema of the specified database.
|
||||||
|
// The output is streamed back to the caller and supposed to be streamed via HTTP.
|
||||||
|
//
|
||||||
|
// Before return the result with the output, it checks that pg_dump produced any output.
|
||||||
|
// If not, it tries to parse the stderr output to determine if the database does not exist
|
||||||
|
// and special error is returned.
|
||||||
|
//
|
||||||
|
// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature.
|
||||||
|
pub async fn get_database_schema(
|
||||||
|
compute: &Arc<ComputeNode>,
|
||||||
|
dbname: &str,
|
||||||
|
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
|
||||||
|
let pgbin = &compute.pgbin;
|
||||||
|
let basepath = Path::new(pgbin).parent().unwrap();
|
||||||
|
let pgdump = basepath.join("pg_dump");
|
||||||
|
let mut connstr = compute.connstr.clone();
|
||||||
|
connstr.set_path(dbname);
|
||||||
|
let mut cmd = Command::new(pgdump)
|
||||||
|
.arg("--schema-only")
|
||||||
|
.arg(connstr.as_str())
|
||||||
|
.stdout(Stdio::piped())
|
||||||
|
.stderr(Stdio::piped())
|
||||||
|
.kill_on_drop(true)
|
||||||
|
.spawn()?;
|
||||||
|
|
||||||
|
let stdout = cmd.stdout.take().ok_or_else(|| {
|
||||||
|
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let stderr = cmd.stderr.take().ok_or_else(|| {
|
||||||
|
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
|
||||||
|
let stderr_reader = BufReader::new(stderr);
|
||||||
|
|
||||||
|
let first_chunk = match stdout_reader.next().await {
|
||||||
|
Some(Ok(bytes)) if !bytes.is_empty() => bytes,
|
||||||
|
Some(Err(e)) => {
|
||||||
|
return Err(SchemaDumpError::IO(e));
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
let mut lines = stderr_reader.lines();
|
||||||
|
if let Some(line) = lines.next_line().await? {
|
||||||
|
if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) {
|
||||||
|
return Err(SchemaDumpError::DatabaseDoesNotExist);
|
||||||
|
}
|
||||||
|
warn!("pg_dump stderr: {}", line)
|
||||||
|
}
|
||||||
|
tokio::spawn(async move {
|
||||||
|
while let Ok(Some(line)) = lines.next_line().await {
|
||||||
|
warn!("pg_dump stderr: {}", line)
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return Err(SchemaDumpError::IO(std::io::Error::new(
|
||||||
|
std::io::ErrorKind::Other,
|
||||||
|
"failed to start pg_dump",
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let initial_stream = stream::once(Ok(first_chunk.freeze()));
|
||||||
|
// Consume stderr and log warnings
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let mut lines = stderr_reader.lines();
|
||||||
|
while let Ok(Some(line)) = lines.next_line().await {
|
||||||
|
warn!("pg_dump stderr: {}", line)
|
||||||
|
}
|
||||||
|
});
|
||||||
|
Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
|
||||||
|
}
|
||||||
@@ -5,17 +5,21 @@ use std::net::SocketAddr;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::thread;
|
use std::thread;
|
||||||
|
|
||||||
|
use crate::catalog::SchemaDumpError;
|
||||||
|
use crate::catalog::{get_database_schema, get_dbs_and_roles};
|
||||||
use crate::compute::forward_termination_signal;
|
use crate::compute::forward_termination_signal;
|
||||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||||
use compute_api::requests::ConfigurationRequest;
|
use compute_api::requests::ConfigurationRequest;
|
||||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use hyper::header::CONTENT_TYPE;
|
||||||
use hyper::service::{make_service_fn, service_fn};
|
use hyper::service::{make_service_fn, service_fn};
|
||||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||||
use tokio::task;
|
use tokio::task;
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
use tracing_utils::http::OtelName;
|
use tracing_utils::http::OtelName;
|
||||||
|
use utils::http::request::must_get_query_param;
|
||||||
|
|
||||||
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
|
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
|
||||||
ComputeStatusResponse {
|
ComputeStatusResponse {
|
||||||
@@ -133,6 +137,34 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
(&Method::GET, "/dbs_and_roles") => {
|
||||||
|
info!("serving /dbs_and_roles GET request",);
|
||||||
|
match get_dbs_and_roles(compute).await {
|
||||||
|
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
|
||||||
|
Err(_) => {
|
||||||
|
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
(&Method::GET, "/database_schema") => {
|
||||||
|
let database = match must_get_query_param(&req, "database") {
|
||||||
|
Err(e) => return e.into_response(),
|
||||||
|
Ok(database) => database,
|
||||||
|
};
|
||||||
|
info!("serving /database_schema GET request with database: {database}",);
|
||||||
|
match get_database_schema(compute, &database).await {
|
||||||
|
Ok(res) => render_plain(Body::wrap_stream(res)),
|
||||||
|
Err(SchemaDumpError::DatabaseDoesNotExist) => {
|
||||||
|
render_json_error("database does not exist", StatusCode::NOT_FOUND)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("can't get schema dump: {}", e);
|
||||||
|
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// download extension files from remote extension storage on demand
|
// download extension files from remote extension storage on demand
|
||||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||||
info!("serving {:?} POST request", route);
|
info!("serving {:?} POST request", route);
|
||||||
@@ -303,10 +335,25 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
|||||||
};
|
};
|
||||||
Response::builder()
|
Response::builder()
|
||||||
.status(status)
|
.status(status)
|
||||||
|
.header(CONTENT_TYPE, "application/json")
|
||||||
.body(Body::from(serde_json::to_string(&error).unwrap()))
|
.body(Body::from(serde_json::to_string(&error).unwrap()))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn render_json(body: Body) -> Response<Body> {
|
||||||
|
Response::builder()
|
||||||
|
.header(CONTENT_TYPE, "application/json")
|
||||||
|
.body(body)
|
||||||
|
.unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn render_plain(body: Body) -> Response<Body> {
|
||||||
|
Response::builder()
|
||||||
|
.header(CONTENT_TYPE, "text/plain")
|
||||||
|
.body(body)
|
||||||
|
.unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
|
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
|
||||||
{
|
{
|
||||||
let mut state = compute.state.lock().unwrap();
|
let mut state = compute.state.lock().unwrap();
|
||||||
|
|||||||
@@ -68,6 +68,51 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/Info"
|
$ref: "#/components/schemas/Info"
|
||||||
|
|
||||||
|
/dbs_and_roles:
|
||||||
|
get:
|
||||||
|
tags:
|
||||||
|
- Info
|
||||||
|
summary: Get databases and roles in the catalog.
|
||||||
|
description: ""
|
||||||
|
operationId: getDbsAndRoles
|
||||||
|
responses:
|
||||||
|
200:
|
||||||
|
description: Compute schema objects
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/DbsAndRoles"
|
||||||
|
|
||||||
|
/database_schema:
|
||||||
|
get:
|
||||||
|
tags:
|
||||||
|
- Info
|
||||||
|
summary: Get schema dump
|
||||||
|
parameters:
|
||||||
|
- name: database
|
||||||
|
in: query
|
||||||
|
description: Database name to dump.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
example: "postgres"
|
||||||
|
description: Get schema dump in SQL format.
|
||||||
|
operationId: getDatabaseSchema
|
||||||
|
responses:
|
||||||
|
200:
|
||||||
|
description: Schema dump
|
||||||
|
content:
|
||||||
|
text/plain:
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
description: Schema dump in SQL format.
|
||||||
|
404:
|
||||||
|
description: Non existing database.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/GenericError"
|
||||||
|
|
||||||
/check_writability:
|
/check_writability:
|
||||||
post:
|
post:
|
||||||
tags:
|
tags:
|
||||||
@@ -229,6 +274,73 @@ components:
|
|||||||
num_cpus:
|
num_cpus:
|
||||||
type: integer
|
type: integer
|
||||||
|
|
||||||
|
DbsAndRoles:
|
||||||
|
type: object
|
||||||
|
description: Databases and Roles
|
||||||
|
required:
|
||||||
|
- roles
|
||||||
|
- databases
|
||||||
|
properties:
|
||||||
|
roles:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: "#/components/schemas/Role"
|
||||||
|
databases:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: "#/components/schemas/Database"
|
||||||
|
|
||||||
|
Database:
|
||||||
|
type: object
|
||||||
|
description: Database
|
||||||
|
required:
|
||||||
|
- name
|
||||||
|
- owner
|
||||||
|
- restrict_conn
|
||||||
|
- invalid
|
||||||
|
properties:
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
owner:
|
||||||
|
type: string
|
||||||
|
options:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: "#/components/schemas/GenericOption"
|
||||||
|
restrict_conn:
|
||||||
|
type: boolean
|
||||||
|
invalid:
|
||||||
|
type: boolean
|
||||||
|
|
||||||
|
Role:
|
||||||
|
type: object
|
||||||
|
description: Role
|
||||||
|
required:
|
||||||
|
- name
|
||||||
|
properties:
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
encrypted_password:
|
||||||
|
type: string
|
||||||
|
options:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: "#/components/schemas/GenericOption"
|
||||||
|
|
||||||
|
GenericOption:
|
||||||
|
type: object
|
||||||
|
description: Schema Generic option
|
||||||
|
required:
|
||||||
|
- name
|
||||||
|
- vartype
|
||||||
|
properties:
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
value:
|
||||||
|
type: string
|
||||||
|
vartype:
|
||||||
|
type: string
|
||||||
|
|
||||||
ComputeState:
|
ComputeState:
|
||||||
type: object
|
type: object
|
||||||
required:
|
required:
|
||||||
|
|||||||
@@ -8,10 +8,12 @@ pub mod configurator;
|
|||||||
pub mod http;
|
pub mod http;
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
pub mod logger;
|
pub mod logger;
|
||||||
|
pub mod catalog;
|
||||||
pub mod compute;
|
pub mod compute;
|
||||||
pub mod extension_server;
|
pub mod extension_server;
|
||||||
pub mod monitor;
|
pub mod monitor;
|
||||||
pub mod params;
|
pub mod params;
|
||||||
pub mod pg_helpers;
|
pub mod pg_helpers;
|
||||||
pub mod spec;
|
pub mod spec;
|
||||||
|
pub mod swap;
|
||||||
pub mod sync_sk;
|
pub mod sync_sk;
|
||||||
|
|||||||
@@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
"rename_db" => {
|
"rename_db" => {
|
||||||
let new_name = op.new_name.as_ref().unwrap();
|
let new_name = op.new_name.as_ref().unwrap();
|
||||||
|
|
||||||
if existing_dbs.get(&op.name).is_some() {
|
if existing_dbs.contains_key(&op.name) {
|
||||||
let query: String = format!(
|
let query: String = format!(
|
||||||
"ALTER DATABASE {} RENAME TO {}",
|
"ALTER DATABASE {} RENAME TO {}",
|
||||||
op.name.pg_quote(),
|
op.name.pg_quote(),
|
||||||
|
|||||||
45
compute_tools/src/swap.rs
Normal file
45
compute_tools/src/swap.rs
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use anyhow::{anyhow, Context};
|
||||||
|
use tracing::warn;
|
||||||
|
|
||||||
|
pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";
|
||||||
|
|
||||||
|
pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
|
||||||
|
// run `/neonvm/bin/resize-swap --once {size_bytes}`
|
||||||
|
//
|
||||||
|
// Passing '--once' causes resize-swap to delete itself after successful completion, which
|
||||||
|
// means that if compute_ctl restarts later, we won't end up calling 'swapoff' while
|
||||||
|
// postgres is running.
|
||||||
|
//
|
||||||
|
// NOTE: resize-swap is not very clever. If present, --once MUST be the first arg.
|
||||||
|
let child_result = std::process::Command::new("/usr/bin/sudo")
|
||||||
|
.arg(RESIZE_SWAP_BIN)
|
||||||
|
.arg("--once")
|
||||||
|
.arg(size_bytes.to_string())
|
||||||
|
.spawn();
|
||||||
|
|
||||||
|
child_result
|
||||||
|
.context("spawn() failed")
|
||||||
|
.and_then(|mut child| child.wait().context("wait() failed"))
|
||||||
|
.and_then(|status| match status.success() {
|
||||||
|
true => Ok(()),
|
||||||
|
false => {
|
||||||
|
// The command failed. Maybe it was because the resize-swap file doesn't exist?
|
||||||
|
// The --once flag causes it to delete itself on success so we don't disable swap
|
||||||
|
// while postgres is running; maybe this is fine.
|
||||||
|
match Path::new(RESIZE_SWAP_BIN).try_exists() {
|
||||||
|
Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
|
||||||
|
// The path doesn't exist; we're actually ok
|
||||||
|
Ok(false) => {
|
||||||
|
warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
// wrap any prior error with the overall context that we couldn't run the command
|
||||||
|
.with_context(|| {
|
||||||
|
format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`")
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -17,6 +17,7 @@ nix.workspace = true
|
|||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
postgres.workspace = true
|
postgres.workspace = true
|
||||||
hex.workspace = true
|
hex.workspace = true
|
||||||
|
humantime-serde.workspace = true
|
||||||
hyper.workspace = true
|
hyper.workspace = true
|
||||||
regex.workspace = true
|
regex.workspace = true
|
||||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||||
@@ -27,6 +28,7 @@ serde_with.workspace = true
|
|||||||
tar.workspace = true
|
tar.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
toml.workspace = true
|
toml.workspace = true
|
||||||
|
toml_edit.workspace = true
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
tokio-postgres.workspace = true
|
tokio-postgres.workspace = true
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
|
|||||||
@@ -9,20 +9,23 @@ use anyhow::{anyhow, bail, Context, Result};
|
|||||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
|
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
|
||||||
use compute_api::spec::ComputeMode;
|
use compute_api::spec::ComputeMode;
|
||||||
use control_plane::endpoint::ComputeControlPlane;
|
use control_plane::endpoint::ComputeControlPlane;
|
||||||
use control_plane::local_env::{InitForceMode, LocalEnv};
|
use control_plane::local_env::{
|
||||||
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
|
||||||
|
SafekeeperConf,
|
||||||
|
};
|
||||||
|
use control_plane::pageserver::PageServerNode;
|
||||||
use control_plane::safekeeper::SafekeeperNode;
|
use control_plane::safekeeper::SafekeeperNode;
|
||||||
use control_plane::storage_controller::StorageController;
|
use control_plane::storage_controller::StorageController;
|
||||||
use control_plane::{broker, local_env};
|
use control_plane::{broker, local_env};
|
||||||
|
use pageserver_api::config::{
|
||||||
|
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||||
|
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
||||||
|
};
|
||||||
use pageserver_api::controller_api::PlacementPolicy;
|
use pageserver_api::controller_api::PlacementPolicy;
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||||
};
|
};
|
||||||
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
|
||||||
use pageserver_api::{
|
|
||||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
|
||||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
|
||||||
};
|
|
||||||
use postgres_backend::AuthType;
|
use postgres_backend::AuthType;
|
||||||
use postgres_connection::parse_host_port;
|
use postgres_connection::parse_host_port;
|
||||||
use safekeeper_api::{
|
use safekeeper_api::{
|
||||||
@@ -52,44 +55,6 @@ const DEFAULT_PG_VERSION: &str = "15";
|
|||||||
|
|
||||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||||
|
|
||||||
fn default_conf(num_pageservers: u16) -> String {
|
|
||||||
let mut template = format!(
|
|
||||||
r#"
|
|
||||||
# Default built-in configuration, defined in main.rs
|
|
||||||
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
|
|
||||||
|
|
||||||
[broker]
|
|
||||||
listen_addr = '{DEFAULT_BROKER_ADDR}'
|
|
||||||
|
|
||||||
[[safekeepers]]
|
|
||||||
id = {DEFAULT_SAFEKEEPER_ID}
|
|
||||||
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
|
|
||||||
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
|
|
||||||
|
|
||||||
"#,
|
|
||||||
);
|
|
||||||
|
|
||||||
for i in 0..num_pageservers {
|
|
||||||
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
|
|
||||||
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
|
|
||||||
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
|
|
||||||
|
|
||||||
template += &format!(
|
|
||||||
r#"
|
|
||||||
[[pageservers]]
|
|
||||||
id = {pageserver_id}
|
|
||||||
listen_pg_addr = '127.0.0.1:{pg_port}'
|
|
||||||
listen_http_addr = '127.0.0.1:{http_port}'
|
|
||||||
pg_auth_type = '{trust_auth}'
|
|
||||||
http_auth_type = '{trust_auth}'
|
|
||||||
"#,
|
|
||||||
trust_auth = AuthType::Trust,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
template
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Timelines tree element used as a value in the HashMap.
|
/// Timelines tree element used as a value in the HashMap.
|
||||||
///
|
///
|
||||||
@@ -133,7 +98,7 @@ fn main() -> Result<()> {
|
|||||||
let subcommand_result = match sub_name {
|
let subcommand_result = match sub_name {
|
||||||
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
|
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
|
||||||
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
|
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
|
||||||
"start" => rt.block_on(handle_start_all(sub_args, &env)),
|
"start" => rt.block_on(handle_start_all(&env)),
|
||||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||||
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
|
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
|
||||||
@@ -152,7 +117,7 @@ fn main() -> Result<()> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
match subcommand_result {
|
match subcommand_result {
|
||||||
Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
|
Ok(Some(updated_env)) => updated_env.persist_config()?,
|
||||||
Ok(None) => (),
|
Ok(None) => (),
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("command failed: {e:?}");
|
eprintln!("command failed: {e:?}");
|
||||||
@@ -341,48 +306,65 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||||
let num_pageservers = init_match
|
let num_pageservers = init_match.get_one::<u16>("num-pageservers");
|
||||||
.get_one::<u16>("num-pageservers")
|
|
||||||
.expect("num-pageservers arg has a default");
|
let force = init_match.get_one("force").expect("we set a default value");
|
||||||
// Create config file
|
|
||||||
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
|
// Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`.
|
||||||
|
let init_conf: NeonLocalInitConf = if let Some(config_path) =
|
||||||
|
init_match.get_one::<PathBuf>("config")
|
||||||
|
{
|
||||||
|
// User (likely the Python test suite) provided a description of the environment.
|
||||||
|
if num_pageservers.is_some() {
|
||||||
|
bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
|
||||||
|
}
|
||||||
// load and parse the file
|
// load and parse the file
|
||||||
std::fs::read_to_string(config_path).with_context(|| {
|
let contents = std::fs::read_to_string(config_path).with_context(|| {
|
||||||
format!(
|
format!(
|
||||||
"Could not read configuration file '{}'",
|
"Could not read configuration file '{}'",
|
||||||
config_path.display()
|
config_path.display()
|
||||||
)
|
)
|
||||||
})?
|
})?;
|
||||||
|
toml_edit::de::from_str(&contents)?
|
||||||
} else {
|
} else {
|
||||||
// Built-in default config
|
// User (likely interactive) did not provide a description of the environment, give them the default
|
||||||
default_conf(*num_pageservers)
|
NeonLocalInitConf {
|
||||||
|
control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())),
|
||||||
|
broker: NeonBroker {
|
||||||
|
listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(),
|
||||||
|
},
|
||||||
|
safekeepers: vec![SafekeeperConf {
|
||||||
|
id: DEFAULT_SAFEKEEPER_ID,
|
||||||
|
pg_port: DEFAULT_SAFEKEEPER_PG_PORT,
|
||||||
|
http_port: DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||||
|
..Default::default()
|
||||||
|
}],
|
||||||
|
pageservers: (0..num_pageservers.copied().unwrap_or(1))
|
||||||
|
.map(|i| {
|
||||||
|
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
|
||||||
|
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
|
||||||
|
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
|
||||||
|
NeonLocalInitPageserverConf {
|
||||||
|
id: pageserver_id,
|
||||||
|
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
|
||||||
|
listen_http_addr: format!("127.0.0.1:{http_port}"),
|
||||||
|
pg_auth_type: AuthType::Trust,
|
||||||
|
http_auth_type: AuthType::Trust,
|
||||||
|
other: Default::default(),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect(),
|
||||||
|
pg_distrib_dir: None,
|
||||||
|
neon_distrib_dir: None,
|
||||||
|
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
|
||||||
|
storage_controller: None,
|
||||||
|
control_plane_compute_hook_api: None,
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let pg_version = init_match
|
LocalEnv::init(init_conf, force)
|
||||||
.get_one::<u32>("pg-version")
|
.context("materialize initial neon_local environment on disk")?;
|
||||||
.copied()
|
Ok(LocalEnv::load_config().expect("freshly written config should be loadable"))
|
||||||
.context("Failed to parse postgres version from the argument string")?;
|
|
||||||
|
|
||||||
let mut env =
|
|
||||||
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
|
|
||||||
let force = init_match.get_one("force").expect("we set a default value");
|
|
||||||
env.init(pg_version, force)
|
|
||||||
.context("Failed to initialize neon repository")?;
|
|
||||||
|
|
||||||
// Create remote storage location for default LocalFs remote storage
|
|
||||||
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
|
|
||||||
|
|
||||||
// Initialize pageserver, create initial tenant and timeline.
|
|
||||||
for ps_conf in &env.pageservers {
|
|
||||||
PageServerNode::from_env(&env, ps_conf)
|
|
||||||
.initialize(&pageserver_config_overrides(init_match))
|
|
||||||
.unwrap_or_else(|e| {
|
|
||||||
eprintln!("pageserver init failed: {e:?}");
|
|
||||||
exit(1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(env)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
|
/// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
|
||||||
@@ -397,15 +379,6 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
|
|||||||
PageServerNode::from_env(env, ps_conf)
|
PageServerNode::from_env(env, ps_conf)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
|
|
||||||
init_match
|
|
||||||
.get_many::<String>("pageserver-config-override")
|
|
||||||
.into_iter()
|
|
||||||
.flatten()
|
|
||||||
.map(String::as_str)
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_tenant(
|
async fn handle_tenant(
|
||||||
tenant_match: &ArgMatches,
|
tenant_match: &ArgMatches,
|
||||||
env: &mut local_env::LocalEnv,
|
env: &mut local_env::LocalEnv,
|
||||||
@@ -417,6 +390,54 @@ async fn handle_tenant(
|
|||||||
println!("{} {:?}", t.id, t.state);
|
println!("{} {:?}", t.id, t.state);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Some(("import", import_match)) => {
|
||||||
|
let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
|
||||||
|
|
||||||
|
let storage_controller = StorageController::from_env(env);
|
||||||
|
let create_response = storage_controller.tenant_import(tenant_id).await?;
|
||||||
|
|
||||||
|
let shard_zero = create_response
|
||||||
|
.shards
|
||||||
|
.first()
|
||||||
|
.expect("Import response omitted shards");
|
||||||
|
|
||||||
|
let attached_pageserver_id = shard_zero.node_id;
|
||||||
|
let pageserver =
|
||||||
|
PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
|
||||||
|
);
|
||||||
|
|
||||||
|
let timelines = pageserver
|
||||||
|
.http_client
|
||||||
|
.list_timelines(shard_zero.shard_id)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
|
||||||
|
let main_timeline = timelines
|
||||||
|
.iter()
|
||||||
|
.find(|t| t.ancestor_timeline_id.is_none())
|
||||||
|
.expect("No timelines found")
|
||||||
|
.timeline_id;
|
||||||
|
|
||||||
|
let mut branch_i = 0;
|
||||||
|
for timeline in timelines.iter() {
|
||||||
|
let branch_name = if timeline.timeline_id == main_timeline {
|
||||||
|
"main".to_string()
|
||||||
|
} else {
|
||||||
|
branch_i += 1;
|
||||||
|
format!("branch_{branch_i}")
|
||||||
|
};
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"Importing timeline {tenant_id}/{} as branch {branch_name}",
|
||||||
|
timeline.timeline_id
|
||||||
|
);
|
||||||
|
|
||||||
|
env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
Some(("create", create_match)) => {
|
Some(("create", create_match)) => {
|
||||||
let tenant_conf: HashMap<_, _> = create_match
|
let tenant_conf: HashMap<_, _> = create_match
|
||||||
.get_many::<String>("config")
|
.get_many::<String>("config")
|
||||||
@@ -789,6 +810,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
|||||||
.copied()
|
.copied()
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
let allow_multiple = sub_args.get_flag("allow-multiple");
|
||||||
|
|
||||||
let mode = match (lsn, hot_standby) {
|
let mode = match (lsn, hot_standby) {
|
||||||
(Some(lsn), false) => ComputeMode::Static(lsn),
|
(Some(lsn), false) => ComputeMode::Static(lsn),
|
||||||
(None, true) => ComputeMode::Replica,
|
(None, true) => ComputeMode::Replica,
|
||||||
@@ -806,7 +829,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
|||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
|
if !allow_multiple {
|
||||||
|
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
|
||||||
|
}
|
||||||
|
|
||||||
cplane.new_endpoint(
|
cplane.new_endpoint(
|
||||||
&endpoint_id,
|
&endpoint_id,
|
||||||
@@ -835,6 +860,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
|||||||
|
|
||||||
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
|
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
|
||||||
|
|
||||||
|
let allow_multiple = sub_args.get_flag("allow-multiple");
|
||||||
|
|
||||||
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
||||||
let safekeepers =
|
let safekeepers =
|
||||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||||
@@ -860,11 +887,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
|||||||
.cloned()
|
.cloned()
|
||||||
.unwrap_or_default();
|
.unwrap_or_default();
|
||||||
|
|
||||||
cplane.check_conflicting_endpoints(
|
if !allow_multiple {
|
||||||
endpoint.mode,
|
cplane.check_conflicting_endpoints(
|
||||||
endpoint.tenant_id,
|
endpoint.mode,
|
||||||
endpoint.timeline_id,
|
endpoint.tenant_id,
|
||||||
)?;
|
endpoint.timeline_id,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
|
||||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||||
@@ -1020,10 +1049,7 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
|||||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||||
match sub_match.subcommand() {
|
match sub_match.subcommand() {
|
||||||
Some(("start", subcommand_args)) => {
|
Some(("start", subcommand_args)) => {
|
||||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
|
||||||
.start(&pageserver_config_overrides(subcommand_args))
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
eprintln!("pageserver start failed: {e}");
|
eprintln!("pageserver start failed: {e}");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@@ -1049,10 +1075,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Err(e) = pageserver
|
if let Err(e) = pageserver.start().await {
|
||||||
.start(&pageserver_config_overrides(subcommand_args))
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
eprintln!("pageserver start failed: {e}");
|
eprintln!("pageserver start failed: {e}");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@@ -1179,7 +1202,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||||
// Endpoints are not started automatically
|
// Endpoints are not started automatically
|
||||||
|
|
||||||
broker::start_broker_process(env).await?;
|
broker::start_broker_process(env).await?;
|
||||||
@@ -1196,10 +1219,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
|||||||
|
|
||||||
for ps_conf in &env.pageservers {
|
for ps_conf in &env.pageservers {
|
||||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||||
if let Err(e) = pageserver
|
if let Err(e) = pageserver.start().await {
|
||||||
.start(&pageserver_config_overrides(sub_match))
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||||
try_stop_all(env, true).await;
|
try_stop_all(env, true).await;
|
||||||
exit(1);
|
exit(1);
|
||||||
@@ -1340,13 +1360,6 @@ fn cli() -> Command {
|
|||||||
.required(false)
|
.required(false)
|
||||||
.value_name("stop-mode");
|
.value_name("stop-mode");
|
||||||
|
|
||||||
let pageserver_config_args = Arg::new("pageserver-config-override")
|
|
||||||
.long("pageserver-config-override")
|
|
||||||
.num_args(1)
|
|
||||||
.action(ArgAction::Append)
|
|
||||||
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
|
|
||||||
.required(false);
|
|
||||||
|
|
||||||
let remote_ext_config_args = Arg::new("remote-ext-config")
|
let remote_ext_config_args = Arg::new("remote-ext-config")
|
||||||
.long("remote-ext-config")
|
.long("remote-ext-config")
|
||||||
.num_args(1)
|
.num_args(1)
|
||||||
@@ -1380,9 +1393,7 @@ fn cli() -> Command {
|
|||||||
let num_pageservers_arg = Arg::new("num-pageservers")
|
let num_pageservers_arg = Arg::new("num-pageservers")
|
||||||
.value_parser(value_parser!(u16))
|
.value_parser(value_parser!(u16))
|
||||||
.long("num-pageservers")
|
.long("num-pageservers")
|
||||||
.help("How many pageservers to create (default 1)")
|
.help("How many pageservers to create (default 1)");
|
||||||
.required(false)
|
|
||||||
.default_value("1");
|
|
||||||
|
|
||||||
let update_catalog = Arg::new("update-catalog")
|
let update_catalog = Arg::new("update-catalog")
|
||||||
.value_parser(value_parser!(bool))
|
.value_parser(value_parser!(bool))
|
||||||
@@ -1396,20 +1407,25 @@ fn cli() -> Command {
|
|||||||
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
|
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
|
||||||
.required(false);
|
.required(false);
|
||||||
|
|
||||||
|
let allow_multiple = Arg::new("allow-multiple")
|
||||||
|
.help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.")
|
||||||
|
.long("allow-multiple")
|
||||||
|
.action(ArgAction::SetTrue)
|
||||||
|
.required(false);
|
||||||
|
|
||||||
Command::new("Neon CLI")
|
Command::new("Neon CLI")
|
||||||
.arg_required_else_help(true)
|
.arg_required_else_help(true)
|
||||||
.version(GIT_VERSION)
|
.version(GIT_VERSION)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("init")
|
Command::new("init")
|
||||||
.about("Initialize a new Neon repository, preparing configs for services to start with")
|
.about("Initialize a new Neon repository, preparing configs for services to start with")
|
||||||
.arg(pageserver_config_args.clone())
|
|
||||||
.arg(num_pageservers_arg.clone())
|
.arg(num_pageservers_arg.clone())
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new("config")
|
Arg::new("config")
|
||||||
.long("config")
|
.long("config")
|
||||||
.required(false)
|
.required(false)
|
||||||
.value_parser(value_parser!(PathBuf))
|
.value_parser(value_parser!(PathBuf))
|
||||||
.value_name("config"),
|
.value_name("config")
|
||||||
)
|
)
|
||||||
.arg(pg_version_arg.clone())
|
.arg(pg_version_arg.clone())
|
||||||
.arg(force_arg)
|
.arg(force_arg)
|
||||||
@@ -1480,6 +1496,8 @@ fn cli() -> Command {
|
|||||||
.subcommand(Command::new("config")
|
.subcommand(Command::new("config")
|
||||||
.arg(tenant_id_arg.clone())
|
.arg(tenant_id_arg.clone())
|
||||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
||||||
|
.subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
|
||||||
|
.about("Import a tenant that is present in remote storage, and create branches for its timelines"))
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("pageserver")
|
Command::new("pageserver")
|
||||||
@@ -1489,7 +1507,6 @@ fn cli() -> Command {
|
|||||||
.subcommand(Command::new("status"))
|
.subcommand(Command::new("status"))
|
||||||
.subcommand(Command::new("start")
|
.subcommand(Command::new("start")
|
||||||
.about("Start local pageserver")
|
.about("Start local pageserver")
|
||||||
.arg(pageserver_config_args.clone())
|
|
||||||
)
|
)
|
||||||
.subcommand(Command::new("stop")
|
.subcommand(Command::new("stop")
|
||||||
.about("Stop local pageserver")
|
.about("Stop local pageserver")
|
||||||
@@ -1497,15 +1514,14 @@ fn cli() -> Command {
|
|||||||
)
|
)
|
||||||
.subcommand(Command::new("restart")
|
.subcommand(Command::new("restart")
|
||||||
.about("Restart local pageserver")
|
.about("Restart local pageserver")
|
||||||
.arg(pageserver_config_args.clone())
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("storage_controller")
|
Command::new("storage_controller")
|
||||||
.arg_required_else_help(true)
|
.arg_required_else_help(true)
|
||||||
.about("Manage storage_controller")
|
.about("Manage storage_controller")
|
||||||
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
.subcommand(Command::new("start").about("Start storage controller"))
|
||||||
.subcommand(Command::new("stop").about("Stop local pageserver")
|
.subcommand(Command::new("stop").about("Stop storage controller")
|
||||||
.arg(stop_mode_arg.clone()))
|
.arg(stop_mode_arg.clone()))
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
@@ -1551,6 +1567,7 @@ fn cli() -> Command {
|
|||||||
.arg(pg_version_arg.clone())
|
.arg(pg_version_arg.clone())
|
||||||
.arg(hot_standby_arg.clone())
|
.arg(hot_standby_arg.clone())
|
||||||
.arg(update_catalog)
|
.arg(update_catalog)
|
||||||
|
.arg(allow_multiple.clone())
|
||||||
)
|
)
|
||||||
.subcommand(Command::new("start")
|
.subcommand(Command::new("start")
|
||||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||||
@@ -1559,6 +1576,7 @@ fn cli() -> Command {
|
|||||||
.arg(safekeepers_arg)
|
.arg(safekeepers_arg)
|
||||||
.arg(remote_ext_config_args)
|
.arg(remote_ext_config_args)
|
||||||
.arg(create_test_user)
|
.arg(create_test_user)
|
||||||
|
.arg(allow_multiple.clone())
|
||||||
)
|
)
|
||||||
.subcommand(Command::new("reconfigure")
|
.subcommand(Command::new("reconfigure")
|
||||||
.about("Reconfigure the endpoint")
|
.about("Reconfigure the endpoint")
|
||||||
@@ -1610,7 +1628,6 @@ fn cli() -> Command {
|
|||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("start")
|
Command::new("start")
|
||||||
.about("Start page server and safekeepers")
|
.about("Start page server and safekeepers")
|
||||||
.arg(pageserver_config_args)
|
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("stop")
|
Command::new("stop")
|
||||||
|
|||||||
@@ -554,6 +554,7 @@ impl Endpoint {
|
|||||||
format_version: 1.0,
|
format_version: 1.0,
|
||||||
operation_uuid: None,
|
operation_uuid: None,
|
||||||
features: self.features.clone(),
|
features: self.features.clone(),
|
||||||
|
swap_size_bytes: None,
|
||||||
cluster: Cluster {
|
cluster: Cluster {
|
||||||
cluster_id: None, // project ID: not used
|
cluster_id: None, // project ID: not used
|
||||||
name: None, // project name: not used
|
name: None, // project name: not used
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
//! Now it also provides init method which acts like a stub for proper installation
|
//! Now it also provides init method which acts like a stub for proper installation
|
||||||
//! script which will use local paths.
|
//! script which will use local paths.
|
||||||
|
|
||||||
use anyhow::{bail, ensure, Context};
|
use anyhow::{bail, Context};
|
||||||
|
|
||||||
use clap::ValueEnum;
|
use clap::ValueEnum;
|
||||||
use postgres_backend::AuthType;
|
use postgres_backend::AuthType;
|
||||||
@@ -17,11 +17,14 @@ use std::net::Ipv4Addr;
|
|||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::process::{Command, Stdio};
|
use std::process::{Command, Stdio};
|
||||||
|
use std::time::Duration;
|
||||||
use utils::{
|
use utils::{
|
||||||
auth::{encode_from_key_file, Claims},
|
auth::{encode_from_key_file, Claims},
|
||||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
use crate::pageserver::PageServerNode;
|
||||||
|
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
|
||||||
use crate::safekeeper::SafekeeperNode;
|
use crate::safekeeper::SafekeeperNode;
|
||||||
|
|
||||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||||
@@ -33,7 +36,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
|
|||||||
// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
|
// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
|
||||||
// an example.
|
// an example.
|
||||||
//
|
//
|
||||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
#[derive(PartialEq, Eq, Clone, Debug)]
|
||||||
pub struct LocalEnv {
|
pub struct LocalEnv {
|
||||||
// Base directory for all the nodes (the pageserver, safekeepers and
|
// Base directory for all the nodes (the pageserver, safekeepers and
|
||||||
// compute endpoints).
|
// compute endpoints).
|
||||||
@@ -41,55 +44,99 @@ pub struct LocalEnv {
|
|||||||
// This is not stored in the config file. Rather, this is the path where the
|
// This is not stored in the config file. Rather, this is the path where the
|
||||||
// config file itself is. It is read from the NEON_REPO_DIR env variable or
|
// config file itself is. It is read from the NEON_REPO_DIR env variable or
|
||||||
// '.neon' if not given.
|
// '.neon' if not given.
|
||||||
#[serde(skip)]
|
|
||||||
pub base_data_dir: PathBuf,
|
pub base_data_dir: PathBuf,
|
||||||
|
|
||||||
// Path to postgres distribution. It's expected that "bin", "include",
|
// Path to postgres distribution. It's expected that "bin", "include",
|
||||||
// "lib", "share" from postgres distribution are there. If at some point
|
// "lib", "share" from postgres distribution are there. If at some point
|
||||||
// in time we will be able to run against vanilla postgres we may split that
|
// in time we will be able to run against vanilla postgres we may split that
|
||||||
// to four separate paths and match OS-specific installation layout.
|
// to four separate paths and match OS-specific installation layout.
|
||||||
#[serde(default)]
|
|
||||||
pub pg_distrib_dir: PathBuf,
|
pub pg_distrib_dir: PathBuf,
|
||||||
|
|
||||||
// Path to pageserver binary.
|
// Path to pageserver binary.
|
||||||
#[serde(default)]
|
|
||||||
pub neon_distrib_dir: PathBuf,
|
pub neon_distrib_dir: PathBuf,
|
||||||
|
|
||||||
// Default tenant ID to use with the 'neon_local' command line utility, when
|
// Default tenant ID to use with the 'neon_local' command line utility, when
|
||||||
// --tenant_id is not explicitly specified.
|
// --tenant_id is not explicitly specified.
|
||||||
#[serde(default)]
|
|
||||||
pub default_tenant_id: Option<TenantId>,
|
pub default_tenant_id: Option<TenantId>,
|
||||||
|
|
||||||
// used to issue tokens during e.g pg start
|
// used to issue tokens during e.g pg start
|
||||||
#[serde(default)]
|
|
||||||
pub private_key_path: PathBuf,
|
pub private_key_path: PathBuf,
|
||||||
|
|
||||||
pub broker: NeonBroker,
|
pub broker: NeonBroker,
|
||||||
|
|
||||||
|
// Configuration for the storage controller (1 per neon_local environment)
|
||||||
|
pub storage_controller: NeonStorageControllerConf,
|
||||||
|
|
||||||
/// This Vec must always contain at least one pageserver
|
/// This Vec must always contain at least one pageserver
|
||||||
|
/// Populated by [`Self::load_config`] from the individual `pageserver.toml`s.
|
||||||
|
/// NB: not used anymore except for informing users that they need to change their `.neon/config`.
|
||||||
pub pageservers: Vec<PageServerConf>,
|
pub pageservers: Vec<PageServerConf>,
|
||||||
|
|
||||||
#[serde(default)]
|
|
||||||
pub safekeepers: Vec<SafekeeperConf>,
|
pub safekeepers: Vec<SafekeeperConf>,
|
||||||
|
|
||||||
// Control plane upcall API for pageserver: if None, we will not run storage_controller. If set, this will
|
// Control plane upcall API for pageserver: if None, we will not run storage_controller. If set, this will
|
||||||
// be propagated into each pageserver's configuration.
|
// be propagated into each pageserver's configuration.
|
||||||
#[serde(default)]
|
|
||||||
pub control_plane_api: Option<Url>,
|
pub control_plane_api: Option<Url>,
|
||||||
|
|
||||||
// Control plane upcall API for storage controller. If set, this will be propagated into the
|
// Control plane upcall API for storage controller. If set, this will be propagated into the
|
||||||
// storage controller's configuration.
|
// storage controller's configuration.
|
||||||
#[serde(default)]
|
|
||||||
pub control_plane_compute_hook_api: Option<Url>,
|
pub control_plane_compute_hook_api: Option<Url>,
|
||||||
|
|
||||||
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
|
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
|
||||||
#[serde(default)]
|
|
||||||
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
|
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
|
||||||
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
|
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
|
||||||
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
|
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
|
||||||
|
pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// On-disk state stored in `.neon/config`.
|
||||||
|
#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
|
||||||
|
#[serde(default, deny_unknown_fields)]
|
||||||
|
pub struct OnDiskConfig {
|
||||||
|
pub pg_distrib_dir: PathBuf,
|
||||||
|
pub neon_distrib_dir: PathBuf,
|
||||||
|
pub default_tenant_id: Option<TenantId>,
|
||||||
|
pub private_key_path: PathBuf,
|
||||||
|
pub broker: NeonBroker,
|
||||||
|
pub storage_controller: NeonStorageControllerConf,
|
||||||
|
#[serde(
|
||||||
|
skip_serializing,
|
||||||
|
deserialize_with = "fail_if_pageservers_field_specified"
|
||||||
|
)]
|
||||||
|
pub pageservers: Vec<PageServerConf>,
|
||||||
|
pub safekeepers: Vec<SafekeeperConf>,
|
||||||
|
pub control_plane_api: Option<Url>,
|
||||||
|
pub control_plane_compute_hook_api: Option<Url>,
|
||||||
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
|
||||||
|
where
|
||||||
|
D: serde::Deserializer<'de>,
|
||||||
|
{
|
||||||
|
Err(serde::de::Error::custom(
|
||||||
|
"The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \
|
||||||
|
Please remove the `pageservers` from your .neon/config.",
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The description of the neon_local env to be initialized by `neon_local init --config`.
|
||||||
|
#[derive(Clone, Debug, Deserialize)]
|
||||||
|
#[serde(deny_unknown_fields)]
|
||||||
|
pub struct NeonLocalInitConf {
|
||||||
|
// TODO: do we need this? Seems unused
|
||||||
|
pub pg_distrib_dir: Option<PathBuf>,
|
||||||
|
// TODO: do we need this? Seems unused
|
||||||
|
pub neon_distrib_dir: Option<PathBuf>,
|
||||||
|
pub default_tenant_id: TenantId,
|
||||||
|
pub broker: NeonBroker,
|
||||||
|
pub storage_controller: Option<NeonStorageControllerConf>,
|
||||||
|
pub pageservers: Vec<NeonLocalInitPageserverConf>,
|
||||||
|
pub safekeepers: Vec<SafekeeperConf>,
|
||||||
|
pub control_plane_api: Option<Option<Url>>,
|
||||||
|
pub control_plane_compute_hook_api: Option<Option<Url>>,
|
||||||
|
}
|
||||||
|
|
||||||
/// Broker config for cluster internal communication.
|
/// Broker config for cluster internal communication.
|
||||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
@@ -98,6 +145,33 @@ pub struct NeonBroker {
|
|||||||
pub listen_addr: SocketAddr,
|
pub listen_addr: SocketAddr,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Storage controller config (one per neon_local environment).
|
||||||
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||||
|
#[serde(default)]
|
||||||
|
pub struct NeonStorageControllerConf {
|
||||||
|
/// Heartbeat timeout before marking a node offline
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
pub max_unavailable: Duration,
|
||||||
|
|
||||||
|
/// Threshold for auto-splitting a tenant into shards
|
||||||
|
pub split_threshold: Option<u64>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NeonStorageControllerConf {
|
||||||
|
// Use a shorter pageserver unavailability interval than the default to speed up tests.
|
||||||
|
const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
|
||||||
|
std::time::Duration::from_secs(10);
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for NeonStorageControllerConf {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
|
||||||
|
split_threshold: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Dummy Default impl to satisfy Deserialize derive.
|
// Dummy Default impl to satisfy Deserialize derive.
|
||||||
impl Default for NeonBroker {
|
impl Default for NeonBroker {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
@@ -113,22 +187,18 @@ impl NeonBroker {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// neon_local needs to know this subset of pageserver configuration.
|
||||||
|
// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`.
|
||||||
|
// It can get stale if `pageserver.toml` is changed.
|
||||||
|
// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml`
|
||||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||||
#[serde(default, deny_unknown_fields)]
|
#[serde(default, deny_unknown_fields)]
|
||||||
pub struct PageServerConf {
|
pub struct PageServerConf {
|
||||||
// node id
|
|
||||||
pub id: NodeId,
|
pub id: NodeId,
|
||||||
|
|
||||||
// Pageserver connection settings
|
|
||||||
pub listen_pg_addr: String,
|
pub listen_pg_addr: String,
|
||||||
pub listen_http_addr: String,
|
pub listen_http_addr: String,
|
||||||
|
|
||||||
// auth type used for the PG and HTTP ports
|
|
||||||
pub pg_auth_type: AuthType,
|
pub pg_auth_type: AuthType,
|
||||||
pub http_auth_type: AuthType,
|
pub http_auth_type: AuthType,
|
||||||
|
|
||||||
pub(crate) virtual_file_io_engine: Option<String>,
|
|
||||||
pub(crate) get_vectored_impl: Option<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for PageServerConf {
|
impl Default for PageServerConf {
|
||||||
@@ -139,8 +209,40 @@ impl Default for PageServerConf {
|
|||||||
listen_http_addr: String::new(),
|
listen_http_addr: String::new(),
|
||||||
pg_auth_type: AuthType::Trust,
|
pg_auth_type: AuthType::Trust,
|
||||||
http_auth_type: AuthType::Trust,
|
http_auth_type: AuthType::Trust,
|
||||||
virtual_file_io_engine: None,
|
}
|
||||||
get_vectored_impl: None,
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The toml that can be passed to `neon_local init --config`.
|
||||||
|
/// This is a subset of the `pageserver.toml` configuration.
|
||||||
|
// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
|
||||||
|
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
|
||||||
|
pub struct NeonLocalInitPageserverConf {
|
||||||
|
pub id: NodeId,
|
||||||
|
pub listen_pg_addr: String,
|
||||||
|
pub listen_http_addr: String,
|
||||||
|
pub pg_auth_type: AuthType,
|
||||||
|
pub http_auth_type: AuthType,
|
||||||
|
#[serde(flatten)]
|
||||||
|
pub other: HashMap<String, toml::Value>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||||
|
fn from(conf: &NeonLocalInitPageserverConf) -> Self {
|
||||||
|
let NeonLocalInitPageserverConf {
|
||||||
|
id,
|
||||||
|
listen_pg_addr,
|
||||||
|
listen_http_addr,
|
||||||
|
pg_auth_type,
|
||||||
|
http_auth_type,
|
||||||
|
other: _,
|
||||||
|
} = conf;
|
||||||
|
Self {
|
||||||
|
id: *id,
|
||||||
|
listen_pg_addr: listen_pg_addr.clone(),
|
||||||
|
listen_http_addr: listen_http_addr.clone(),
|
||||||
|
pg_auth_type: *pg_auth_type,
|
||||||
|
http_auth_type: *http_auth_type,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -328,41 +430,7 @@ impl LocalEnv {
|
|||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create a LocalEnv from a config file.
|
/// Construct `Self` from on-disk state.
|
||||||
///
|
|
||||||
/// Unlike 'load_config', this function fills in any defaults that are missing
|
|
||||||
/// from the config file.
|
|
||||||
pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
|
|
||||||
let mut env: LocalEnv = toml::from_str(toml)?;
|
|
||||||
|
|
||||||
// Find postgres binaries.
|
|
||||||
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
|
|
||||||
// Note that later in the code we assume, that distrib dirs follow the same pattern
|
|
||||||
// for all postgres versions.
|
|
||||||
if env.pg_distrib_dir == Path::new("") {
|
|
||||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
|
||||||
env.pg_distrib_dir = postgres_bin.into();
|
|
||||||
} else {
|
|
||||||
let cwd = env::current_dir()?;
|
|
||||||
env.pg_distrib_dir = cwd.join("pg_install")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find neon binaries.
|
|
||||||
if env.neon_distrib_dir == Path::new("") {
|
|
||||||
env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
|
||||||
}
|
|
||||||
|
|
||||||
if env.pageservers.is_empty() {
|
|
||||||
anyhow::bail!("Configuration must contain at least one pageserver");
|
|
||||||
}
|
|
||||||
|
|
||||||
env.base_data_dir = base_path();
|
|
||||||
|
|
||||||
Ok(env)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Locate and load config
|
|
||||||
pub fn load_config() -> anyhow::Result<Self> {
|
pub fn load_config() -> anyhow::Result<Self> {
|
||||||
let repopath = base_path();
|
let repopath = base_path();
|
||||||
|
|
||||||
@@ -376,38 +444,129 @@ impl LocalEnv {
|
|||||||
// TODO: check that it looks like a neon repository
|
// TODO: check that it looks like a neon repository
|
||||||
|
|
||||||
// load and parse file
|
// load and parse file
|
||||||
let config = fs::read_to_string(repopath.join("config"))?;
|
let config_file_contents = fs::read_to_string(repopath.join("config"))?;
|
||||||
let mut env: LocalEnv = toml::from_str(config.as_str())?;
|
let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?;
|
||||||
|
let mut env = {
|
||||||
|
let OnDiskConfig {
|
||||||
|
pg_distrib_dir,
|
||||||
|
neon_distrib_dir,
|
||||||
|
default_tenant_id,
|
||||||
|
private_key_path,
|
||||||
|
broker,
|
||||||
|
storage_controller,
|
||||||
|
pageservers,
|
||||||
|
safekeepers,
|
||||||
|
control_plane_api,
|
||||||
|
control_plane_compute_hook_api,
|
||||||
|
branch_name_mappings,
|
||||||
|
} = on_disk_config;
|
||||||
|
LocalEnv {
|
||||||
|
base_data_dir: repopath.clone(),
|
||||||
|
pg_distrib_dir,
|
||||||
|
neon_distrib_dir,
|
||||||
|
default_tenant_id,
|
||||||
|
private_key_path,
|
||||||
|
broker,
|
||||||
|
storage_controller,
|
||||||
|
pageservers,
|
||||||
|
safekeepers,
|
||||||
|
control_plane_api,
|
||||||
|
control_plane_compute_hook_api,
|
||||||
|
branch_name_mappings,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
env.base_data_dir = repopath;
|
// The source of truth for pageserver configuration is the pageserver.toml.
|
||||||
|
assert!(
|
||||||
|
env.pageservers.is_empty(),
|
||||||
|
"we ensure this during deserialization"
|
||||||
|
);
|
||||||
|
env.pageservers = {
|
||||||
|
let iter = std::fs::read_dir(&repopath).context("open dir")?;
|
||||||
|
let mut pageservers = Vec::new();
|
||||||
|
for res in iter {
|
||||||
|
let dentry = res?;
|
||||||
|
const PREFIX: &str = "pageserver_";
|
||||||
|
let dentry_name = dentry
|
||||||
|
.file_name()
|
||||||
|
.into_string()
|
||||||
|
.ok()
|
||||||
|
.with_context(|| format!("non-utf8 dentry: {:?}", dentry.path()))
|
||||||
|
.unwrap();
|
||||||
|
if !dentry_name.starts_with(PREFIX) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if !dentry.file_type().context("determine file type")?.is_dir() {
|
||||||
|
anyhow::bail!("expected a directory, got {:?}", dentry.path());
|
||||||
|
}
|
||||||
|
let id = dentry_name[PREFIX.len()..]
|
||||||
|
.parse::<NodeId>()
|
||||||
|
.with_context(|| format!("parse id from {:?}", dentry.path()))?;
|
||||||
|
// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
|
||||||
|
#[derive(serde::Serialize, serde::Deserialize)]
|
||||||
|
// (allow unknown fields, unlike PageServerConf)
|
||||||
|
struct PageserverConfigTomlSubset {
|
||||||
|
id: NodeId,
|
||||||
|
listen_pg_addr: String,
|
||||||
|
listen_http_addr: String,
|
||||||
|
pg_auth_type: AuthType,
|
||||||
|
http_auth_type: AuthType,
|
||||||
|
}
|
||||||
|
let config_toml_path = dentry.path().join("pageserver.toml");
|
||||||
|
let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
|
||||||
|
&std::fs::read_to_string(&config_toml_path)
|
||||||
|
.with_context(|| format!("read {:?}", config_toml_path))?,
|
||||||
|
)
|
||||||
|
.context("parse pageserver.toml")?;
|
||||||
|
let PageserverConfigTomlSubset {
|
||||||
|
id: config_toml_id,
|
||||||
|
listen_pg_addr,
|
||||||
|
listen_http_addr,
|
||||||
|
pg_auth_type,
|
||||||
|
http_auth_type,
|
||||||
|
} = config_toml;
|
||||||
|
let conf = PageServerConf {
|
||||||
|
id: {
|
||||||
|
anyhow::ensure!(
|
||||||
|
config_toml_id == id,
|
||||||
|
"id mismatch: config_toml.id={config_toml_id} id={id}",
|
||||||
|
);
|
||||||
|
id
|
||||||
|
},
|
||||||
|
listen_pg_addr,
|
||||||
|
listen_http_addr,
|
||||||
|
pg_auth_type,
|
||||||
|
http_auth_type,
|
||||||
|
};
|
||||||
|
pageservers.push(conf);
|
||||||
|
}
|
||||||
|
pageservers
|
||||||
|
};
|
||||||
|
|
||||||
Ok(env)
|
Ok(env)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
|
pub fn persist_config(&self) -> anyhow::Result<()> {
|
||||||
// Currently, the user first passes a config file with 'neon_local init --config=<path>'
|
Self::persist_config_impl(
|
||||||
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
|
&self.base_data_dir,
|
||||||
// to .neon/config. TODO: We lose any formatting and comments along the way, which is
|
&OnDiskConfig {
|
||||||
// a bit sad.
|
pg_distrib_dir: self.pg_distrib_dir.clone(),
|
||||||
let mut conf_content = r#"# This file describes a local deployment of the page server
|
neon_distrib_dir: self.neon_distrib_dir.clone(),
|
||||||
# and safekeeeper node. It is read by the 'neon_local' command-line
|
default_tenant_id: self.default_tenant_id,
|
||||||
# utility.
|
private_key_path: self.private_key_path.clone(),
|
||||||
"#
|
broker: self.broker.clone(),
|
||||||
.to_string();
|
storage_controller: self.storage_controller.clone(),
|
||||||
|
pageservers: vec![], // it's skip_serializing anyway
|
||||||
// Convert the LocalEnv to a toml file.
|
safekeepers: self.safekeepers.clone(),
|
||||||
//
|
control_plane_api: self.control_plane_api.clone(),
|
||||||
// This could be as simple as this:
|
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
|
||||||
//
|
branch_name_mappings: self.branch_name_mappings.clone(),
|
||||||
// conf_content += &toml::to_string_pretty(env)?;
|
},
|
||||||
//
|
)
|
||||||
// But it results in a "values must be emitted before tables". I'm not sure
|
}
|
||||||
// why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
|
|
||||||
// Maybe rust reorders the fields to avoid padding or something?
|
|
||||||
// In any case, converting to toml::Value first, and serializing that, works.
|
|
||||||
// See https://github.com/alexcrichton/toml-rs/issues/142
|
|
||||||
conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
|
|
||||||
|
|
||||||
|
pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> {
|
||||||
|
let conf_content = &toml::to_string_pretty(config)?;
|
||||||
let target_config_path = base_path.join("config");
|
let target_config_path = base_path.join("config");
|
||||||
fs::write(&target_config_path, conf_content).with_context(|| {
|
fs::write(&target_config_path, conf_content).with_context(|| {
|
||||||
format!(
|
format!(
|
||||||
@@ -432,17 +591,13 @@ impl LocalEnv {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
/// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
|
||||||
// Initialize a new Neon repository
|
pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
|
||||||
//
|
let base_path = base_path();
|
||||||
pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
|
assert_ne!(base_path, Path::new(""));
|
||||||
// check if config already exists
|
let base_path = &base_path;
|
||||||
let base_path = &self.base_data_dir;
|
|
||||||
ensure!(
|
|
||||||
base_path != Path::new(""),
|
|
||||||
"repository base path is missing"
|
|
||||||
);
|
|
||||||
|
|
||||||
|
// create base_path dir
|
||||||
if base_path.exists() {
|
if base_path.exists() {
|
||||||
match force {
|
match force {
|
||||||
InitForceMode::MustNotExist => {
|
InitForceMode::MustNotExist => {
|
||||||
@@ -474,70 +629,96 @@ impl LocalEnv {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
|
|
||||||
bail!(
|
|
||||||
"Can't find postgres binary at {}",
|
|
||||||
self.pg_bin_dir(pg_version)?.display()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
for binary in ["pageserver", "safekeeper"] {
|
|
||||||
if !self.neon_distrib_dir.join(binary).exists() {
|
|
||||||
bail!(
|
|
||||||
"Can't find binary '{binary}' in neon distrib dir '{}'",
|
|
||||||
self.neon_distrib_dir.display()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !base_path.exists() {
|
if !base_path.exists() {
|
||||||
fs::create_dir(base_path)?;
|
fs::create_dir(base_path)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let NeonLocalInitConf {
|
||||||
|
pg_distrib_dir,
|
||||||
|
neon_distrib_dir,
|
||||||
|
default_tenant_id,
|
||||||
|
broker,
|
||||||
|
storage_controller,
|
||||||
|
pageservers,
|
||||||
|
safekeepers,
|
||||||
|
control_plane_api,
|
||||||
|
control_plane_compute_hook_api,
|
||||||
|
} = conf;
|
||||||
|
|
||||||
|
// Find postgres binaries.
|
||||||
|
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
|
||||||
|
// Note that later in the code we assume, that distrib dirs follow the same pattern
|
||||||
|
// for all postgres versions.
|
||||||
|
let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| {
|
||||||
|
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||||
|
postgres_bin.into()
|
||||||
|
} else {
|
||||||
|
let cwd = env::current_dir().unwrap();
|
||||||
|
cwd.join("pg_install")
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Find neon binaries.
|
||||||
|
let neon_distrib_dir = neon_distrib_dir
|
||||||
|
.unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned());
|
||||||
|
|
||||||
// Generate keypair for JWT.
|
// Generate keypair for JWT.
|
||||||
//
|
//
|
||||||
// The keypair is only needed if authentication is enabled in any of the
|
// The keypair is only needed if authentication is enabled in any of the
|
||||||
// components. For convenience, we generate the keypair even if authentication
|
// components. For convenience, we generate the keypair even if authentication
|
||||||
// is not enabled, so that you can easily enable it after the initialization
|
// is not enabled, so that you can easily enable it after the initialization
|
||||||
// step. However, if the key generation fails, we treat it as non-fatal if
|
// step.
|
||||||
// authentication was not enabled.
|
generate_auth_keys(
|
||||||
if self.private_key_path == PathBuf::new() {
|
base_path.join("auth_private_key.pem").as_path(),
|
||||||
match generate_auth_keys(
|
base_path.join("auth_public_key.pem").as_path(),
|
||||||
base_path.join("auth_private_key.pem").as_path(),
|
)
|
||||||
base_path.join("auth_public_key.pem").as_path(),
|
.context("generate auth keys")?;
|
||||||
) {
|
let private_key_path = PathBuf::from("auth_private_key.pem");
|
||||||
Ok(()) => {
|
|
||||||
self.private_key_path = PathBuf::from("auth_private_key.pem");
|
// create the runtime type because the remaining initialization code below needs
|
||||||
}
|
// a LocalEnv instance to operate on
|
||||||
Err(e) => {
|
// TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state
|
||||||
if !self.auth_keys_needed() {
|
let env = LocalEnv {
|
||||||
eprintln!("Could not generate keypair for JWT authentication: {e}");
|
base_data_dir: base_path.clone(),
|
||||||
eprintln!("Continuing anyway because authentication was not enabled");
|
pg_distrib_dir,
|
||||||
self.private_key_path = PathBuf::from("auth_private_key.pem");
|
neon_distrib_dir,
|
||||||
} else {
|
default_tenant_id: Some(default_tenant_id),
|
||||||
return Err(e);
|
private_key_path,
|
||||||
}
|
broker,
|
||||||
}
|
storage_controller: storage_controller.unwrap_or_default(),
|
||||||
}
|
pageservers: pageservers.iter().map(Into::into).collect(),
|
||||||
|
safekeepers,
|
||||||
|
control_plane_api: control_plane_api.unwrap_or_default(),
|
||||||
|
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
|
||||||
|
branch_name_mappings: Default::default(),
|
||||||
|
};
|
||||||
|
|
||||||
|
// create endpoints dir
|
||||||
|
fs::create_dir_all(env.endpoints_path())?;
|
||||||
|
|
||||||
|
// create safekeeper dirs
|
||||||
|
for safekeeper in &env.safekeepers {
|
||||||
|
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
fs::create_dir_all(self.endpoints_path())?;
|
// initialize pageserver state
|
||||||
|
for (i, ps) in pageservers.into_iter().enumerate() {
|
||||||
for safekeeper in &self.safekeepers {
|
let runtime_ps = &env.pageservers[i];
|
||||||
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
|
assert_eq!(&PageServerConf::from(&ps), runtime_ps);
|
||||||
|
fs::create_dir(env.pageserver_data_dir(ps.id))?;
|
||||||
|
PageServerNode::from_env(&env, runtime_ps)
|
||||||
|
.initialize(ps)
|
||||||
|
.context("pageserver init failed")?;
|
||||||
}
|
}
|
||||||
|
|
||||||
self.persist_config(base_path)
|
// set up the remote location for the default LocalFs remote storage
|
||||||
}
|
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
|
||||||
|
|
||||||
fn auth_keys_needed(&self) -> bool {
|
env.persist_config()
|
||||||
self.pageservers.iter().any(|ps| {
|
|
||||||
ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
|
|
||||||
}) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn base_path() -> PathBuf {
|
pub fn base_path() -> PathBuf {
|
||||||
match std::env::var_os("NEON_REPO_DIR") {
|
match std::env::var_os("NEON_REPO_DIR") {
|
||||||
Some(val) => PathBuf::from(val),
|
Some(val) => PathBuf::from(val),
|
||||||
None => PathBuf::from(".neon"),
|
None => PathBuf::from(".neon"),
|
||||||
@@ -580,31 +761,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
|
|||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn simple_conf_parsing() {
|
|
||||||
let simple_conf_toml = include_str!("../simple.conf");
|
|
||||||
let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
|
|
||||||
assert!(
|
|
||||||
simple_conf_parse_result.is_ok(),
|
|
||||||
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
|
|
||||||
);
|
|
||||||
|
|
||||||
let string_to_replace = "listen_addr = '127.0.0.1:50051'";
|
|
||||||
let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
|
|
||||||
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
|
|
||||||
assert!(
|
|
||||||
spoiled_url_toml.contains(spoiled_url_str),
|
|
||||||
"Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
|
|
||||||
);
|
|
||||||
let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
|
|
||||||
assert!(
|
|
||||||
spoiled_url_parse_result.is_err(),
|
|
||||||
"expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -4,21 +4,21 @@
|
|||||||
//!
|
//!
|
||||||
//! .neon/
|
//! .neon/
|
||||||
//!
|
//!
|
||||||
use std::borrow::Cow;
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use std::num::NonZeroU64;
|
use std::num::NonZeroU64;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::process::Command;
|
use std::str::FromStr;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use futures::SinkExt;
|
use futures::SinkExt;
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
|
self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
|
||||||
|
TimelineInfo,
|
||||||
};
|
};
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
use pageserver_client::mgmt_api;
|
use pageserver_client::mgmt_api;
|
||||||
@@ -30,7 +30,7 @@ use utils::{
|
|||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::local_env::PageServerConf;
|
use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf};
|
||||||
use crate::{background_process, local_env::LocalEnv};
|
use crate::{background_process, local_env::LocalEnv};
|
||||||
|
|
||||||
/// Directory within .neon which will be used by default for LocalFs remote storage.
|
/// Directory within .neon which will be used by default for LocalFs remote storage.
|
||||||
@@ -74,57 +74,23 @@ impl PageServerNode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Merge overrides provided by the user on the command line with our default overrides derived from neon_local configuration.
|
fn pageserver_init_make_toml(
|
||||||
///
|
&self,
|
||||||
/// These all end up on the command line of the `pageserver` binary.
|
conf: NeonLocalInitPageserverConf,
|
||||||
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
|
) -> anyhow::Result<toml_edit::Document> {
|
||||||
|
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
|
||||||
|
|
||||||
|
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
|
||||||
|
|
||||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
|
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
|
||||||
let pg_distrib_dir_param = format!(
|
let pg_distrib_dir_param = format!(
|
||||||
"pg_distrib_dir='{}'",
|
"pg_distrib_dir='{}'",
|
||||||
self.env.pg_distrib_dir_raw().display()
|
self.env.pg_distrib_dir_raw().display()
|
||||||
);
|
);
|
||||||
|
|
||||||
let PageServerConf {
|
|
||||||
id,
|
|
||||||
listen_pg_addr,
|
|
||||||
listen_http_addr,
|
|
||||||
pg_auth_type,
|
|
||||||
http_auth_type,
|
|
||||||
virtual_file_io_engine,
|
|
||||||
get_vectored_impl,
|
|
||||||
} = &self.conf;
|
|
||||||
|
|
||||||
let id = format!("id={}", id);
|
|
||||||
|
|
||||||
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
|
|
||||||
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
|
|
||||||
|
|
||||||
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
|
|
||||||
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
|
|
||||||
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
|
|
||||||
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
|
|
||||||
format!("get_vectored_impl='{get_vectored_impl}'")
|
|
||||||
} else {
|
|
||||||
String::new()
|
|
||||||
};
|
|
||||||
|
|
||||||
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
||||||
|
|
||||||
let mut overrides = vec![
|
let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param];
|
||||||
id,
|
|
||||||
pg_distrib_dir_param,
|
|
||||||
http_auth_type_param,
|
|
||||||
pg_auth_type_param,
|
|
||||||
listen_http_addr_param,
|
|
||||||
listen_pg_addr_param,
|
|
||||||
broker_endpoint_param,
|
|
||||||
virtual_file_io_engine,
|
|
||||||
get_vectored_impl,
|
|
||||||
];
|
|
||||||
|
|
||||||
if let Some(control_plane_api) = &self.env.control_plane_api {
|
if let Some(control_plane_api) = &self.env.control_plane_api {
|
||||||
overrides.push(format!(
|
overrides.push(format!(
|
||||||
@@ -134,7 +100,7 @@ impl PageServerNode {
|
|||||||
|
|
||||||
// Storage controller uses the same auth as pageserver: if JWT is enabled
|
// Storage controller uses the same auth as pageserver: if JWT is enabled
|
||||||
// for us, we will also need it to talk to them.
|
// for us, we will also need it to talk to them.
|
||||||
if matches!(http_auth_type, AuthType::NeonJWT) {
|
if matches!(conf.http_auth_type, AuthType::NeonJWT) {
|
||||||
let jwt_token = self
|
let jwt_token = self
|
||||||
.env
|
.env
|
||||||
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
|
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
|
||||||
@@ -143,31 +109,40 @@ impl PageServerNode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !cli_overrides
|
if !conf.other.contains_key("remote_storage") {
|
||||||
.iter()
|
|
||||||
.any(|c| c.starts_with("remote_storage"))
|
|
||||||
{
|
|
||||||
overrides.push(format!(
|
overrides.push(format!(
|
||||||
"remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
|
"remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
|
if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust {
|
||||||
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
||||||
// are one level below that, so refer to keys with ../
|
// are one level below that, so refer to keys with ../
|
||||||
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply the user-provided overrides
|
// Apply the user-provided overrides
|
||||||
overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
|
overrides.push(
|
||||||
|
toml_edit::ser::to_string_pretty(&conf)
|
||||||
|
.expect("we deserialized this from toml earlier"),
|
||||||
|
);
|
||||||
|
|
||||||
overrides
|
// Turn `overrides` into a toml document.
|
||||||
|
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
|
||||||
|
let mut config_toml = toml_edit::Document::new();
|
||||||
|
for fragment_str in overrides {
|
||||||
|
let fragment = toml_edit::Document::from_str(&fragment_str)
|
||||||
|
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
|
||||||
|
for (key, item) in fragment.iter() {
|
||||||
|
config_toml.insert(key, item.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(config_toml)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Initializes a pageserver node by creating its config with the overrides provided.
|
/// Initializes a pageserver node by creating its config with the overrides provided.
|
||||||
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
|
||||||
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
|
self.pageserver_init(conf)
|
||||||
self.pageserver_init(config_overrides)
|
|
||||||
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
|
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -183,11 +158,11 @@ impl PageServerNode {
|
|||||||
.expect("non-Unicode path")
|
.expect("non-Unicode path")
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
pub async fn start(&self) -> anyhow::Result<()> {
|
||||||
self.start_node(config_overrides, false).await
|
self.start_node().await
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
|
||||||
let datadir = self.repo_path();
|
let datadir = self.repo_path();
|
||||||
let node_id = self.conf.id;
|
let node_id = self.conf.id;
|
||||||
println!(
|
println!(
|
||||||
@@ -198,29 +173,20 @@ impl PageServerNode {
|
|||||||
);
|
);
|
||||||
io::stdout().flush()?;
|
io::stdout().flush()?;
|
||||||
|
|
||||||
if !datadir.exists() {
|
let config = self
|
||||||
std::fs::create_dir(&datadir)?;
|
.pageserver_init_make_toml(conf)
|
||||||
}
|
.context("make pageserver toml")?;
|
||||||
|
let config_file_path = datadir.join("pageserver.toml");
|
||||||
let datadir_path_str = datadir.to_str().with_context(|| {
|
let mut config_file = std::fs::OpenOptions::new()
|
||||||
format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
|
.create_new(true)
|
||||||
})?;
|
.write(true)
|
||||||
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
|
.open(&config_file_path)
|
||||||
args.push(Cow::Borrowed("--init"));
|
.with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?;
|
||||||
|
config_file
|
||||||
let init_output = Command::new(self.env.pageserver_bin())
|
.write_all(config.to_string().as_bytes())
|
||||||
.args(args.iter().map(Cow::as_ref))
|
.context("write pageserver toml")?;
|
||||||
.envs(self.pageserver_env_variables()?)
|
drop(config_file);
|
||||||
.output()
|
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
|
||||||
.with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
|
|
||||||
|
|
||||||
anyhow::ensure!(
|
|
||||||
init_output.status.success(),
|
|
||||||
"Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
|
|
||||||
node_id,
|
|
||||||
String::from_utf8_lossy(&init_output.stdout),
|
|
||||||
String::from_utf8_lossy(&init_output.stderr),
|
|
||||||
);
|
|
||||||
|
|
||||||
// Write metadata file, used by pageserver on startup to register itself with
|
// Write metadata file, used by pageserver on startup to register itself with
|
||||||
// the storage controller
|
// the storage controller
|
||||||
@@ -234,12 +200,13 @@ impl PageServerNode {
|
|||||||
// situation: the metadata is written by some other script.
|
// situation: the metadata is written by some other script.
|
||||||
std::fs::write(
|
std::fs::write(
|
||||||
metadata_path,
|
metadata_path,
|
||||||
serde_json::to_vec(&serde_json::json!({
|
serde_json::to_vec(&pageserver_api::config::NodeMetadata {
|
||||||
"host": "localhost",
|
postgres_host: "localhost".to_string(),
|
||||||
"port": self.pg_connection_config.port(),
|
postgres_port: self.pg_connection_config.port(),
|
||||||
"http_host": "localhost",
|
http_host: "localhost".to_string(),
|
||||||
"http_port": http_port,
|
http_port,
|
||||||
}))
|
other: HashMap::new(),
|
||||||
|
})
|
||||||
.unwrap(),
|
.unwrap(),
|
||||||
)
|
)
|
||||||
.expect("Failed to write metadata file");
|
.expect("Failed to write metadata file");
|
||||||
@@ -247,11 +214,7 @@ impl PageServerNode {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn start_node(
|
async fn start_node(&self) -> anyhow::Result<()> {
|
||||||
&self,
|
|
||||||
config_overrides: &[&str],
|
|
||||||
update_config: bool,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||||
let datadir = self.repo_path();
|
let datadir = self.repo_path();
|
||||||
print!(
|
print!(
|
||||||
@@ -268,15 +231,12 @@ impl PageServerNode {
|
|||||||
self.conf.id, datadir,
|
self.conf.id, datadir,
|
||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
|
let args = vec!["-D", datadir_path_str];
|
||||||
if update_config {
|
|
||||||
args.push(Cow::Borrowed("--update-config"));
|
|
||||||
}
|
|
||||||
background_process::start_process(
|
background_process::start_process(
|
||||||
"pageserver",
|
"pageserver",
|
||||||
&datadir,
|
&datadir,
|
||||||
&self.env.pageserver_bin(),
|
&self.env.pageserver_bin(),
|
||||||
args.iter().map(Cow::as_ref),
|
args,
|
||||||
self.pageserver_env_variables()?,
|
self.pageserver_env_variables()?,
|
||||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||||
|| async {
|
|| async {
|
||||||
@@ -293,22 +253,6 @@ impl PageServerNode {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pageserver_basic_args<'a>(
|
|
||||||
&self,
|
|
||||||
config_overrides: &'a [&'a str],
|
|
||||||
datadir_path_str: &'a str,
|
|
||||||
) -> Vec<Cow<'a, str>> {
|
|
||||||
let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
|
|
||||||
|
|
||||||
let overrides = self.neon_local_overrides(config_overrides);
|
|
||||||
for config_override in overrides {
|
|
||||||
args.push(Cow::Borrowed("-c"));
|
|
||||||
args.push(Cow::Owned(config_override));
|
|
||||||
}
|
|
||||||
|
|
||||||
args
|
|
||||||
}
|
|
||||||
|
|
||||||
fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
|
fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
|
||||||
// FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
|
// FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
|
||||||
// needs a token, and how to generate that token, seems independent to whether
|
// needs a token, and how to generate that token, seems independent to whether
|
||||||
@@ -434,6 +378,11 @@ impl PageServerNode {
|
|||||||
.map(serde_json::from_str)
|
.map(serde_json::from_str)
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("parse `timeline_get_throttle` from json")?,
|
.context("parse `timeline_get_throttle` from json")?,
|
||||||
|
switch_aux_file_policy: settings
|
||||||
|
.remove("switch_aux_file_policy")
|
||||||
|
.map(|x| x.parse::<AuxFilePolicy>())
|
||||||
|
.transpose()
|
||||||
|
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||||
};
|
};
|
||||||
if !settings.is_empty() {
|
if !settings.is_empty() {
|
||||||
bail!("Unrecognized tenant settings: {settings:?}")
|
bail!("Unrecognized tenant settings: {settings:?}")
|
||||||
@@ -552,6 +501,11 @@ impl PageServerNode {
|
|||||||
.map(serde_json::from_str)
|
.map(serde_json::from_str)
|
||||||
.transpose()
|
.transpose()
|
||||||
.context("parse `timeline_get_throttle` from json")?,
|
.context("parse `timeline_get_throttle` from json")?,
|
||||||
|
switch_aux_file_policy: settings
|
||||||
|
.remove("switch_aux_file_policy")
|
||||||
|
.map(|x| x.parse::<AuxFilePolicy>())
|
||||||
|
.transpose()
|
||||||
|
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
use crate::{background_process, local_env::LocalEnv};
|
use crate::{
|
||||||
|
background_process,
|
||||||
|
local_env::{LocalEnv, NeonStorageControllerConf},
|
||||||
|
};
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use hyper::Method;
|
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
controller_api::{
|
controller_api::{
|
||||||
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
|
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
|
||||||
@@ -14,6 +16,7 @@ use pageserver_api::{
|
|||||||
};
|
};
|
||||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||||
use postgres_backend::AuthType;
|
use postgres_backend::AuthType;
|
||||||
|
use reqwest::Method;
|
||||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||||
use std::{fs, str::FromStr};
|
use std::{fs, str::FromStr};
|
||||||
use tokio::process::Command;
|
use tokio::process::Command;
|
||||||
@@ -32,15 +35,13 @@ pub struct StorageController {
|
|||||||
public_key: Option<String>,
|
public_key: Option<String>,
|
||||||
postgres_port: u16,
|
postgres_port: u16,
|
||||||
client: reqwest::Client,
|
client: reqwest::Client,
|
||||||
|
config: NeonStorageControllerConf,
|
||||||
}
|
}
|
||||||
|
|
||||||
const COMMAND: &str = "storage_controller";
|
const COMMAND: &str = "storage_controller";
|
||||||
|
|
||||||
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||||
|
|
||||||
// Use a shorter pageserver unavailability interval than the default to speed up tests.
|
|
||||||
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct AttachHookRequest {
|
pub struct AttachHookRequest {
|
||||||
pub tenant_shard_id: TenantShardId,
|
pub tenant_shard_id: TenantShardId,
|
||||||
@@ -135,6 +136,7 @@ impl StorageController {
|
|||||||
client: reqwest::ClientBuilder::new()
|
client: reqwest::ClientBuilder::new()
|
||||||
.build()
|
.build()
|
||||||
.expect("Failed to construct http client"),
|
.expect("Failed to construct http client"),
|
||||||
|
config: env.storage_controller.clone(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -241,9 +243,13 @@ impl StorageController {
|
|||||||
anyhow::bail!("initdb failed with status {status}");
|
anyhow::bail!("initdb failed with status {status}");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Write a minimal config file:
|
||||||
|
// - Specify the port, since this is chosen dynamically
|
||||||
|
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||||
|
// the storage controller we don't want a slow local disk to interfere with that.
|
||||||
tokio::fs::write(
|
tokio::fs::write(
|
||||||
&pg_data_path.join("postgresql.conf"),
|
&pg_data_path.join("postgresql.conf"),
|
||||||
format!("port = {}", self.postgres_port),
|
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
};
|
};
|
||||||
@@ -272,8 +278,6 @@ impl StorageController {
|
|||||||
// Run migrations on every startup, in case something changed.
|
// Run migrations on every startup, in case something changed.
|
||||||
let database_url = self.setup_database().await?;
|
let database_url = self.setup_database().await?;
|
||||||
|
|
||||||
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
|
|
||||||
|
|
||||||
let mut args = vec![
|
let mut args = vec![
|
||||||
"-l",
|
"-l",
|
||||||
&self.listen,
|
&self.listen,
|
||||||
@@ -283,7 +287,7 @@ impl StorageController {
|
|||||||
"--database-url",
|
"--database-url",
|
||||||
&database_url,
|
&database_url,
|
||||||
"--max-unavailable-interval",
|
"--max-unavailable-interval",
|
||||||
&max_unavailable.to_string(),
|
&humantime::Duration::from(self.config.max_unavailable).to_string(),
|
||||||
]
|
]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|s| s.to_string())
|
.map(|s| s.to_string())
|
||||||
@@ -305,6 +309,10 @@ impl StorageController {
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(split_threshold) = self.config.split_threshold.as_ref() {
|
||||||
|
args.push(format!("--split-threshold={split_threshold}"))
|
||||||
|
}
|
||||||
|
|
||||||
background_process::start_process(
|
background_process::start_process(
|
||||||
COMMAND,
|
COMMAND,
|
||||||
&self.env.base_data_dir,
|
&self.env.base_data_dir,
|
||||||
@@ -379,7 +387,7 @@ impl StorageController {
|
|||||||
/// Simple HTTP request wrapper for calling into storage controller
|
/// Simple HTTP request wrapper for calling into storage controller
|
||||||
async fn dispatch<RQ, RS>(
|
async fn dispatch<RQ, RS>(
|
||||||
&self,
|
&self,
|
||||||
method: hyper::Method,
|
method: reqwest::Method,
|
||||||
path: String,
|
path: String,
|
||||||
body: Option<RQ>,
|
body: Option<RQ>,
|
||||||
) -> anyhow::Result<RS>
|
) -> anyhow::Result<RS>
|
||||||
@@ -472,6 +480,16 @@ impl StorageController {
|
|||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[instrument(skip(self))]
|
||||||
|
pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
|
||||||
|
self.dispatch::<(), TenantCreateResponse>(
|
||||||
|
Method::POST,
|
||||||
|
format!("debug/v1/tenant/{tenant_id}/import"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
#[instrument(skip(self))]
|
#[instrument(skip(self))]
|
||||||
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
||||||
self.dispatch::<(), _>(
|
self.dispatch::<(), _>(
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||||
|
|
||||||
use clap::{Parser, Subcommand};
|
use clap::{Parser, Subcommand};
|
||||||
use hyper::{Method, StatusCode};
|
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
controller_api::{
|
controller_api::{
|
||||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
||||||
@@ -14,7 +13,7 @@ use pageserver_api::{
|
|||||||
shard::{ShardStripeSize, TenantShardId},
|
shard::{ShardStripeSize, TenantShardId},
|
||||||
};
|
};
|
||||||
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
||||||
use reqwest::Url;
|
use reqwest::{Method, StatusCode, Url};
|
||||||
use serde::{de::DeserializeOwned, Serialize};
|
use serde::{de::DeserializeOwned, Serialize};
|
||||||
use utils::id::{NodeId, TenantId};
|
use utils::id::{NodeId, TenantId};
|
||||||
|
|
||||||
@@ -232,7 +231,7 @@ impl Client {
|
|||||||
/// Simple HTTP request wrapper for calling into storage controller
|
/// Simple HTTP request wrapper for calling into storage controller
|
||||||
async fn dispatch<RQ, RS>(
|
async fn dispatch<RQ, RS>(
|
||||||
&self,
|
&self,
|
||||||
method: hyper::Method,
|
method: Method,
|
||||||
path: String,
|
path: String,
|
||||||
body: Option<RQ>,
|
body: Option<RQ>,
|
||||||
) -> mgmt_api::Result<RS>
|
) -> mgmt_api::Result<RS>
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
ARG REPOSITORY=neondatabase
|
||||||
ARG COMPUTE_IMAGE=compute-node-v14
|
ARG COMPUTE_IMAGE=compute-node-v14
|
||||||
ARG TAG=latest
|
ARG TAG=latest
|
||||||
|
|
||||||
|
|||||||
@@ -8,8 +8,6 @@
|
|||||||
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
|
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
|
||||||
# to verify custom image builds (e.g pre-published ones).
|
# to verify custom image builds (e.g pre-published ones).
|
||||||
|
|
||||||
# XXX: Currently does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
|
|
||||||
|
|
||||||
set -eux -o pipefail
|
set -eux -o pipefail
|
||||||
|
|
||||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state
|
|||||||
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
|
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
|
||||||
rebuilt on startup.
|
rebuilt on startup.
|
||||||
|
|
||||||
The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
|
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
|
||||||
|
|
||||||
The `diesel` crate is used for defining models & migrations.
|
The `diesel` crate is used for defining models & migrations.
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use serde::{Deserialize, Serialize, Serializer};
|
use serde::{Deserialize, Serialize, Serializer};
|
||||||
|
|
||||||
use crate::spec::ComputeSpec;
|
use crate::spec::{ComputeSpec, Database, Role};
|
||||||
|
|
||||||
#[derive(Serialize, Debug, Deserialize)]
|
#[derive(Serialize, Debug, Deserialize)]
|
||||||
pub struct GenericAPIError {
|
pub struct GenericAPIError {
|
||||||
@@ -113,6 +113,12 @@ pub struct ComputeMetrics {
|
|||||||
pub total_ext_download_size: u64,
|
pub total_ext_download_size: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Default, Serialize)]
|
||||||
|
pub struct CatalogObjects {
|
||||||
|
pub roles: Vec<Role>,
|
||||||
|
pub databases: Vec<Database>,
|
||||||
|
}
|
||||||
|
|
||||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||||
/// This is not actually a compute API response, so consider moving
|
/// This is not actually a compute API response, so consider moving
|
||||||
/// to a different place.
|
/// to a different place.
|
||||||
|
|||||||
@@ -33,6 +33,23 @@ pub struct ComputeSpec {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub features: Vec<ComputeFeature>,
|
pub features: Vec<ComputeFeature>,
|
||||||
|
|
||||||
|
/// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs
|
||||||
|
/// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first
|
||||||
|
/// received.
|
||||||
|
///
|
||||||
|
/// Both this field and `--resize-swap-on-bind` are required, so that the control plane's
|
||||||
|
/// spec generation doesn't need to be aware of the actual compute it's running on, while
|
||||||
|
/// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could
|
||||||
|
/// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus
|
||||||
|
/// giving every VM much more swap than it should have (32GiB).
|
||||||
|
///
|
||||||
|
/// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for
|
||||||
|
/// enabling the swap resizing behavior once rollout is complete.
|
||||||
|
///
|
||||||
|
/// See neondatabase/cloud#12047 for more.
|
||||||
|
#[serde(default)]
|
||||||
|
pub swap_size_bytes: Option<u64>,
|
||||||
|
|
||||||
/// Expected cluster state at the end of transition process.
|
/// Expected cluster state at the end of transition process.
|
||||||
pub cluster: Cluster,
|
pub cluster: Cluster,
|
||||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||||
|
|||||||
@@ -256,7 +256,16 @@ fn update_rusage_metrics() {
|
|||||||
DISK_IO_BYTES
|
DISK_IO_BYTES
|
||||||
.with_label_values(&["write"])
|
.with_label_values(&["write"])
|
||||||
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
|
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
|
||||||
MAXRSS_KB.set(rusage_stats.ru_maxrss);
|
|
||||||
|
// On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
|
||||||
|
#[cfg(target_os = "macos")]
|
||||||
|
{
|
||||||
|
MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
|
||||||
|
}
|
||||||
|
#[cfg(not(target_os = "macos"))]
|
||||||
|
{
|
||||||
|
MAXRSS_KB.set(rusage_stats.ru_maxrss);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_rusage_stats() -> libc::rusage {
|
fn get_rusage_stats() -> libc::rusage {
|
||||||
@@ -471,6 +480,15 @@ impl<A: CounterPairAssoc> CounterPairVec<A> {
|
|||||||
let id = self.vec.with_labels(labels);
|
let id = self.vec.with_labels(labels);
|
||||||
self.vec.remove_metric(id)
|
self.vec.remove_metric(id)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn sample(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) -> u64 {
|
||||||
|
let id = self.vec.with_labels(labels);
|
||||||
|
let metric = self.vec.get_metric(id);
|
||||||
|
|
||||||
|
let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed);
|
||||||
|
let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed);
|
||||||
|
inc.saturating_sub(dec)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
|
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
|
||||||
|
|||||||
31
libs/pageserver_api/src/config.rs
Normal file
31
libs/pageserver_api/src/config.rs
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use const_format::formatcp;
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests;
|
||||||
|
|
||||||
|
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||||
|
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||||
|
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||||
|
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||||
|
|
||||||
|
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||||
|
// as a separate structure. This information is not needed by the pageserver
|
||||||
|
// itself, it is only used for registering the pageserver with the control
|
||||||
|
// plane and/or storage controller.
|
||||||
|
//
|
||||||
|
#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
|
||||||
|
pub struct NodeMetadata {
|
||||||
|
#[serde(rename = "host")]
|
||||||
|
pub postgres_host: String,
|
||||||
|
#[serde(rename = "port")]
|
||||||
|
pub postgres_port: u16,
|
||||||
|
pub http_host: String,
|
||||||
|
pub http_port: u16,
|
||||||
|
|
||||||
|
// Deployment tools may write fields to the metadata file beyond what we
|
||||||
|
// use in this type: this type intentionally only names the fields that we require.
|
||||||
|
#[serde(flatten)]
|
||||||
|
pub other: HashMap<String, serde_json::Value>,
|
||||||
|
}
|
||||||
22
libs/pageserver_api/src/config/tests.rs
Normal file
22
libs/pageserver_api/src/config/tests.rs
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_node_metadata_v1_backward_compatibilty() {
|
||||||
|
let v1 = serde_json::to_vec(&serde_json::json!({
|
||||||
|
"host": "localhost",
|
||||||
|
"port": 23,
|
||||||
|
"http_host": "localhost",
|
||||||
|
"http_port": 42,
|
||||||
|
}));
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
serde_json::from_slice::<NodeMetadata>(&v1.unwrap()).unwrap(),
|
||||||
|
NodeMetadata {
|
||||||
|
postgres_host: "localhost".to_string(),
|
||||||
|
postgres_port: 23,
|
||||||
|
http_host: "localhost".to_string(),
|
||||||
|
http_port: 42,
|
||||||
|
other: HashMap::new(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
use anyhow::{bail, Result};
|
use anyhow::{bail, Result};
|
||||||
use byteorder::{ByteOrder, BE};
|
use byteorder::{ByteOrder, BE};
|
||||||
|
use bytes::BufMut;
|
||||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||||
use postgres_ffi::{Oid, TransactionId};
|
use postgres_ffi::{Oid, TransactionId};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
@@ -21,15 +22,107 @@ pub struct Key {
|
|||||||
pub field6: u32,
|
pub field6: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The storage key size.
|
||||||
pub const KEY_SIZE: usize = 18;
|
pub const KEY_SIZE: usize = 18;
|
||||||
|
|
||||||
|
/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
|
||||||
|
/// See [`Key::to_i128`] for more information on the encoding.
|
||||||
|
pub const METADATA_KEY_SIZE: usize = 16;
|
||||||
|
|
||||||
|
/// The key prefix range for the metadata keys. All keys whose first byte is >= 0x60 and < 0x7F are metadata keys.
|
||||||
|
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
|
||||||
|
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
|
||||||
|
|
||||||
|
/// The (reserved) key prefix of relation sizes.
|
||||||
|
pub const RELATION_SIZE_PREFIX: u8 = 0x61;
|
||||||
|
|
||||||
|
/// The key prefix of AUX file keys.
|
||||||
|
pub const AUX_KEY_PREFIX: u8 = 0x62;
|
||||||
|
|
||||||
|
/// Check if the key falls in the range of metadata keys.
|
||||||
|
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
|
||||||
|
key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
|
||||||
|
}
|
||||||
|
|
||||||
impl Key {
|
impl Key {
|
||||||
|
/// Check if the key falls in the range of metadata keys.
|
||||||
|
pub const fn is_metadata_key(&self) -> bool {
|
||||||
|
self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Encode a metadata key to a storage key.
|
||||||
|
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
|
||||||
|
assert!(is_metadata_key_slice(key), "key not in metadata key range");
|
||||||
|
Key {
|
||||||
|
field1: key[0],
|
||||||
|
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
|
||||||
|
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
|
||||||
|
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
|
||||||
|
field5: key[11],
|
||||||
|
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Encode a metadata key to a storage key.
|
||||||
|
pub fn from_metadata_key(key: &[u8]) -> Self {
|
||||||
|
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract a metadata key to a writer. The result should always be 16 bytes.
|
||||||
|
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
|
||||||
|
writer.put_u8(self.field1);
|
||||||
|
assert!(self.field2 <= 0xFFFF);
|
||||||
|
writer.put_u16(self.field2 as u16);
|
||||||
|
writer.put_u32(self.field3);
|
||||||
|
writer.put_u32(self.field4);
|
||||||
|
writer.put_u8(self.field5);
|
||||||
|
writer.put_u32(self.field6);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the range of metadata keys.
|
||||||
|
pub const fn metadata_key_range() -> Range<Self> {
|
||||||
|
Key {
|
||||||
|
field1: METADATA_KEY_BEGIN_PREFIX,
|
||||||
|
field2: 0,
|
||||||
|
field3: 0,
|
||||||
|
field4: 0,
|
||||||
|
field5: 0,
|
||||||
|
field6: 0,
|
||||||
|
}..Key {
|
||||||
|
field1: METADATA_KEY_END_PREFIX,
|
||||||
|
field2: 0,
|
||||||
|
field3: 0,
|
||||||
|
field4: 0,
|
||||||
|
field5: 0,
|
||||||
|
field6: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the range of aux keys.
|
||||||
|
pub fn metadata_aux_key_range() -> Range<Self> {
|
||||||
|
Key {
|
||||||
|
field1: AUX_KEY_PREFIX,
|
||||||
|
field2: 0,
|
||||||
|
field3: 0,
|
||||||
|
field4: 0,
|
||||||
|
field5: 0,
|
||||||
|
field6: 0,
|
||||||
|
}..Key {
|
||||||
|
field1: AUX_KEY_PREFIX + 1,
|
||||||
|
field2: 0,
|
||||||
|
field3: 0,
|
||||||
|
field4: 0,
|
||||||
|
field5: 0,
|
||||||
|
field6: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
||||||
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
||||||
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
||||||
pub fn to_i128(&self) -> i128 {
|
pub fn to_i128(&self) -> i128 {
|
||||||
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
|
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
|
||||||
(((self.field1 & 0xf) as i128) << 120)
|
(((self.field1 & 0x7F) as i128) << 120)
|
||||||
| (((self.field2 & 0xFFFF) as i128) << 104)
|
| (((self.field2 & 0xFFFF) as i128) << 104)
|
||||||
| ((self.field3 as i128) << 72)
|
| ((self.field3 as i128) << 72)
|
||||||
| ((self.field4 as i128) << 40)
|
| ((self.field4 as i128) << 40)
|
||||||
@@ -39,7 +132,7 @@ impl Key {
|
|||||||
|
|
||||||
pub const fn from_i128(x: i128) -> Self {
|
pub const fn from_i128(x: i128) -> Self {
|
||||||
Key {
|
Key {
|
||||||
field1: ((x >> 120) & 0xf) as u8,
|
field1: ((x >> 120) & 0x7F) as u8,
|
||||||
field2: ((x >> 104) & 0xFFFF) as u32,
|
field2: ((x >> 104) & 0xFFFF) as u32,
|
||||||
field3: (x >> 72) as u32,
|
field3: (x >> 72) as u32,
|
||||||
field4: (x >> 40) as u32,
|
field4: (x >> 40) as u32,
|
||||||
@@ -48,11 +141,11 @@ impl Key {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn next(&self) -> Key {
|
pub const fn next(&self) -> Key {
|
||||||
self.add(1)
|
self.add(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add(&self, x: u32) -> Key {
|
pub const fn add(&self, x: u32) -> Key {
|
||||||
let mut key = *self;
|
let mut key = *self;
|
||||||
|
|
||||||
let r = key.field6.overflowing_add(x);
|
let r = key.field6.overflowing_add(x);
|
||||||
@@ -81,6 +174,8 @@ impl Key {
|
|||||||
key
|
key
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convert an 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
|
||||||
|
/// Use [`Key::from_metadata_key`] instead.
|
||||||
pub fn from_slice(b: &[u8]) -> Self {
|
pub fn from_slice(b: &[u8]) -> Self {
|
||||||
Key {
|
Key {
|
||||||
field1: b[0],
|
field1: b[0],
|
||||||
@@ -92,6 +187,8 @@ impl Key {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convert a key to an 18B slice. This function should not be used for metadata keys because field2 is handled differently.
|
||||||
|
/// Use [`Key::extract_metadata_key_to_writer`] instead.
|
||||||
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
||||||
buf[0] = self.field1;
|
buf[0] = self.field1;
|
||||||
BE::write_u32(&mut buf[1..5], self.field2);
|
BE::write_u32(&mut buf[1..5], self.field2);
|
||||||
@@ -475,12 +572,17 @@ pub const AUX_FILES_KEY: Key = Key {
|
|||||||
// Reverse mappings for a few Keys.
|
// Reverse mappings for a few Keys.
|
||||||
// These are needed by WAL redo manager.
|
// These are needed by WAL redo manager.
|
||||||
|
|
||||||
|
/// Non inherited range for vectored get.
|
||||||
|
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
|
||||||
|
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
|
||||||
|
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
|
||||||
|
|
||||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||||
// switch (and generally it likely should be optional), so ignore these.
|
// switch (and generally it likely should be optional), so ignore these.
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
pub fn is_inherited_key(key: Key) -> bool {
|
pub fn is_inherited_key(key: Key) -> bool {
|
||||||
key != AUX_FILES_KEY
|
!NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
@@ -556,11 +658,14 @@ impl std::str::FromStr for Key {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use crate::key::is_metadata_key_slice;
|
||||||
use crate::key::Key;
|
use crate::key::Key;
|
||||||
|
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use rand::SeedableRng;
|
use rand::SeedableRng;
|
||||||
|
|
||||||
|
use super::AUX_KEY_PREFIX;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn display_fromstr_bijection() {
|
fn display_fromstr_bijection() {
|
||||||
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||||
@@ -576,4 +681,16 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
|
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_metadata_keys() {
|
||||||
|
let mut metadata_key = vec![AUX_KEY_PREFIX];
|
||||||
|
metadata_key.extend_from_slice(&[0xFF; 15]);
|
||||||
|
let encoded_key = Key::from_metadata_key(&metadata_key);
|
||||||
|
let mut output_key = Vec::new();
|
||||||
|
encoded_key.extract_metadata_key_to_writer(&mut output_key);
|
||||||
|
assert_eq!(metadata_key, output_key);
|
||||||
|
assert!(encoded_key.is_metadata_key());
|
||||||
|
assert!(is_metadata_key_slice(&metadata_key));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
use postgres_ffi::BLCKSZ;
|
use postgres_ffi::BLCKSZ;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
|
|
||||||
use crate::key::Key;
|
use crate::{
|
||||||
|
key::Key,
|
||||||
|
shard::{ShardCount, ShardIdentity},
|
||||||
|
};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -14,44 +17,279 @@ pub struct KeySpace {
|
|||||||
pub ranges: Vec<Range<Key>>,
|
pub ranges: Vec<Range<Key>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl KeySpace {
|
/// A wrapper type for sparse keyspaces.
|
||||||
|
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||||
|
pub struct SparseKeySpace(pub KeySpace);
|
||||||
|
|
||||||
|
/// Represents a contiguous half-open range of the keyspace, masked according to a particular
|
||||||
|
/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
|
||||||
|
/// shard.
|
||||||
|
///
|
||||||
|
/// When we iterate over keys within this object, we will skip any keys that don't belong
|
||||||
|
/// to this shard.
|
||||||
|
///
|
||||||
|
/// The start + end keys may not belong to the shard: these specify where layer files should
|
||||||
|
/// start + end, but we will never actually read/write those keys.
|
||||||
|
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||||
|
pub struct ShardedRange<'a> {
|
||||||
|
pub shard_identity: &'a ShardIdentity,
|
||||||
|
pub range: Range<Key>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate the size of a range within the blocks of the same relation, or spanning only the
|
||||||
|
// top page in the previous relation's space.
|
||||||
|
fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
||||||
|
debug_assert!(is_contiguous_range(range));
|
||||||
|
if range.start.field6 == 0xffffffff {
|
||||||
|
range.end.field6 + 1
|
||||||
|
} else {
|
||||||
|
range.end.field6 - range.start.field6
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return true if this key range includes only keys in the same relation's data blocks, or
|
||||||
|
/// just spanning one relation and the logical size (0xffffffff) block of the relation before it.
|
||||||
|
///
|
||||||
|
/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not
|
||||||
|
/// be on our shard. Later in ShardedRange we do the extra work to figure out how much
|
||||||
|
/// of a given contiguous range is present on one shard.
|
||||||
|
///
|
||||||
|
/// This matters, because:
|
||||||
|
/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse.
|
||||||
|
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
|
||||||
|
fn is_contiguous_range(range: &Range<Key>) -> bool {
|
||||||
|
range.start.field1 == range.end.field1
|
||||||
|
&& range.start.field2 == range.end.field2
|
||||||
|
&& range.start.field3 == range.end.field3
|
||||||
|
&& range.start.field4 == range.end.field4
|
||||||
|
&& (range.start.field5 == range.end.field5
|
||||||
|
|| (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5))
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> ShardedRange<'a> {
|
||||||
|
pub fn new(range: Range<Key>, shard_identity: &'a ShardIdentity) -> Self {
|
||||||
|
Self {
|
||||||
|
shard_identity,
|
||||||
|
range,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Break up this range into chunks, each of which has at least one local key in it if the
|
||||||
|
/// total range has at least one local key.
|
||||||
|
pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range<Key>)> {
|
||||||
|
// Optimization for single-key case (e.g. logical size keys)
|
||||||
|
if self.range.end == self.range.start.add(1) {
|
||||||
|
return vec![(
|
||||||
|
if self.shard_identity.is_key_disposable(&self.range.start) {
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
1
|
||||||
|
},
|
||||||
|
self.range,
|
||||||
|
)];
|
||||||
|
}
|
||||||
|
|
||||||
|
if !is_contiguous_range(&self.range) {
|
||||||
|
// Ranges that span relations are not fragmented. We only get these ranges as a result
|
||||||
|
// of operations that act on existing layers, so we trust that the existing range is
|
||||||
|
// reasonably small.
|
||||||
|
return vec![(u32::MAX, self.range)];
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut fragments: Vec<(u32, Range<Key>)> = Vec::new();
|
||||||
|
|
||||||
|
let mut cursor = self.range.start;
|
||||||
|
while cursor < self.range.end {
|
||||||
|
let advance_by = self.distance_to_next_boundary(cursor);
|
||||||
|
let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);
|
||||||
|
|
||||||
|
// If the previous fragment is undersized, then we seek to consume enough
|
||||||
|
// blocks to complete it.
|
||||||
|
let (want_blocks, merge_last_fragment) = match fragments.last_mut() {
|
||||||
|
Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)),
|
||||||
|
Some(frag) => {
|
||||||
|
// Prev block is complete, want the full number.
|
||||||
|
(
|
||||||
|
target_nblocks,
|
||||||
|
if is_fragment_disposable {
|
||||||
|
// If this current range will be empty (not shard-local data), we will merge into previous
|
||||||
|
Some(frag)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// First iteration, want the full number
|
||||||
|
(target_nblocks, None)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let advance_by = if is_fragment_disposable {
|
||||||
|
advance_by
|
||||||
|
} else {
|
||||||
|
std::cmp::min(advance_by, want_blocks)
|
||||||
|
};
|
||||||
|
|
||||||
|
let next_cursor = cursor.add(advance_by);
|
||||||
|
|
||||||
|
let this_frag = (
|
||||||
|
if is_fragment_disposable {
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
advance_by
|
||||||
|
},
|
||||||
|
cursor..next_cursor,
|
||||||
|
);
|
||||||
|
cursor = next_cursor;
|
||||||
|
|
||||||
|
if let Some(last_fragment) = merge_last_fragment {
|
||||||
|
// Previous fragment was short or this one is empty, merge into it
|
||||||
|
last_fragment.0 += this_frag.0;
|
||||||
|
last_fragment.1.end = this_frag.1.end;
|
||||||
|
} else {
|
||||||
|
fragments.push(this_frag);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fragments
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimate the physical pages that are within this range, on this shard. This returns
|
||||||
|
/// u32::MAX if the range spans relations: this return value should be interpreted as "large".
|
||||||
|
pub fn page_count(&self) -> u32 {
|
||||||
|
// Special cases for single keys like logical sizes
|
||||||
|
if self.range.end == self.range.start.add(1) {
|
||||||
|
return if self.shard_identity.is_key_disposable(&self.range.start) {
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
1
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// We can only do an authentic calculation of contiguous key ranges
|
||||||
|
if !is_contiguous_range(&self.range) {
|
||||||
|
return u32::MAX;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Special case for single sharded tenants: our logical and physical sizes are the same
|
||||||
|
if self.shard_identity.count < ShardCount::new(2) {
|
||||||
|
return contiguous_range_len(&self.range);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs
|
||||||
|
// to Self, and add the stripe's block count to our total if so.
|
||||||
|
let mut result: u64 = 0;
|
||||||
|
let mut cursor = self.range.start;
|
||||||
|
while cursor < self.range.end {
|
||||||
|
// Count up to the next stripe_size boundary or end of range
|
||||||
|
let advance_by = self.distance_to_next_boundary(cursor);
|
||||||
|
|
||||||
|
// If the blocks in this stripe belong to us, add them to our count
|
||||||
|
if !self.shard_identity.is_key_disposable(&cursor) {
|
||||||
|
result += advance_by as u64;
|
||||||
|
}
|
||||||
|
|
||||||
|
cursor = cursor.add(advance_by);
|
||||||
|
}
|
||||||
|
|
||||||
|
if result > u32::MAX as u64 {
|
||||||
|
u32::MAX
|
||||||
|
} else {
|
||||||
|
result as u32
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Advance the cursor to the next potential fragment boundary: this is either
|
||||||
|
/// a stripe boundary, or the end of the range.
|
||||||
|
fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
|
||||||
|
let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end));
|
||||||
|
|
||||||
|
if self.shard_identity.count < ShardCount::new(2) {
|
||||||
|
// Optimization: don't bother stepping through stripes if the tenant isn't sharded.
|
||||||
|
return distance_to_range_end;
|
||||||
|
}
|
||||||
|
|
||||||
|
if cursor.field6 == 0xffffffff {
|
||||||
|
// We are wrapping from one relation's logical size to the next relation's first data block
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0;
|
||||||
|
let stripe_remainder = self.shard_identity.stripe_size.0
|
||||||
|
- (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0);
|
||||||
|
|
||||||
|
if cfg!(debug_assertions) {
|
||||||
|
// We should never overflow field5 and field6 -- our callers check this earlier
|
||||||
|
// and would have returned their u32::MAX cases if the input range violated this.
|
||||||
|
let next_cursor = cursor.add(stripe_remainder);
|
||||||
|
debug_assert!(
|
||||||
|
next_cursor.field1 == cursor.field1
|
||||||
|
&& next_cursor.field2 == cursor.field2
|
||||||
|
&& next_cursor.field3 == cursor.field3
|
||||||
|
&& next_cursor.field4 == cursor.field4
|
||||||
|
&& next_cursor.field5 == cursor.field5
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cmp::min(stripe_remainder, distance_to_range_end)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whereas `page_count` estimates the number of pages physically in this range on this shard,
|
||||||
|
/// this function simply calculates the number of pages in the space, without accounting for those
|
||||||
|
/// pages that would not actually be stored on this node.
|
||||||
///
|
///
|
||||||
|
/// Don't use this function in code that works with physical entities like layer files.
|
||||||
|
pub fn raw_size(range: &Range<Key>) -> u32 {
|
||||||
|
if is_contiguous_range(range) {
|
||||||
|
contiguous_range_len(range)
|
||||||
|
} else {
|
||||||
|
u32::MAX
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KeySpace {
|
||||||
|
/// Create a key space with a single range.
|
||||||
|
pub fn single(key_range: Range<Key>) -> Self {
|
||||||
|
Self {
|
||||||
|
ranges: vec![key_range],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Partition a key space into roughly chunks of roughly 'target_size' bytes
|
/// Partition a key space into roughly chunks of roughly 'target_size' bytes
|
||||||
/// in each partition.
|
/// in each partition.
|
||||||
///
|
///
|
||||||
pub fn partition(&self, target_size: u64) -> KeyPartitioning {
|
pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
|
||||||
// Assume that each value is 8k in size.
|
// Assume that each value is 8k in size.
|
||||||
let target_nblocks = (target_size / BLCKSZ as u64) as usize;
|
let target_nblocks = (target_size / BLCKSZ as u64) as u32;
|
||||||
|
|
||||||
let mut parts = Vec::new();
|
let mut parts = Vec::new();
|
||||||
let mut current_part = Vec::new();
|
let mut current_part = Vec::new();
|
||||||
let mut current_part_size: usize = 0;
|
let mut current_part_size: usize = 0;
|
||||||
for range in &self.ranges {
|
for range in &self.ranges {
|
||||||
// If appending the next contiguous range in the keyspace to the current
|
// While doing partitioning, wrap the range in ShardedRange so that our size calculations
|
||||||
// partition would cause it to be too large, start a new partition.
|
// will respect shard striping rather than assuming all keys within a range are present.
|
||||||
let this_size = key_range_size(range) as usize;
|
let range = ShardedRange::new(range.clone(), shard_identity);
|
||||||
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
|
|
||||||
parts.push(KeySpace {
|
|
||||||
ranges: current_part,
|
|
||||||
});
|
|
||||||
current_part = Vec::new();
|
|
||||||
current_part_size = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the next range is larger than 'target_size', split it into
|
// Chunk up the range into parts that each contain up to target_size local blocks
|
||||||
// 'target_size' chunks.
|
for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) {
|
||||||
let mut remain_size = this_size;
|
// If appending the next contiguous range in the keyspace to the current
|
||||||
let mut start = range.start;
|
// partition would cause it to be too large, and our current partition
|
||||||
while remain_size > target_nblocks {
|
// covers at least one block that is physically present in this shard,
|
||||||
let next = start.add(target_nblocks as u32);
|
// then start a new partition
|
||||||
parts.push(KeySpace {
|
if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
|
||||||
ranges: vec![start..next],
|
&& current_part_size > 0
|
||||||
});
|
{
|
||||||
start = next;
|
parts.push(KeySpace {
|
||||||
remain_size -= target_nblocks
|
ranges: current_part,
|
||||||
|
});
|
||||||
|
current_part = Vec::new();
|
||||||
|
current_part_size = 0;
|
||||||
|
}
|
||||||
|
current_part.push(frag_range.start..frag_range.end);
|
||||||
|
current_part_size += frag_on_shard_size as usize;
|
||||||
}
|
}
|
||||||
current_part.push(start..range.end);
|
|
||||||
current_part_size += remain_size;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// add last partition that wasn't full yet.
|
// add last partition that wasn't full yet.
|
||||||
@@ -64,8 +302,12 @@ impl KeySpace {
|
|||||||
KeyPartitioning { parts }
|
KeyPartitioning { parts }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.total_raw_size() == 0
|
||||||
|
}
|
||||||
|
|
||||||
/// Merge another keyspace into the current one.
|
/// Merge another keyspace into the current one.
|
||||||
/// Note: the keyspaces must not ovelap (enforced via assertions)
|
/// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
|
||||||
pub fn merge(&mut self, other: &KeySpace) {
|
pub fn merge(&mut self, other: &KeySpace) {
|
||||||
let all_ranges = self
|
let all_ranges = self
|
||||||
.ranges
|
.ranges
|
||||||
@@ -94,12 +336,13 @@ impl KeySpace {
|
|||||||
|
|
||||||
/// Remove all keys in `other` from `self`.
|
/// Remove all keys in `other` from `self`.
|
||||||
/// This can involve splitting or removing of existing ranges.
|
/// This can involve splitting or removing of existing ranges.
|
||||||
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
|
/// Returns the removed keyspace
|
||||||
|
pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
|
||||||
let (self_start, self_end) = match (self.start(), self.end()) {
|
let (self_start, self_end) = match (self.start(), self.end()) {
|
||||||
(Some(start), Some(end)) => (start, end),
|
(Some(start), Some(end)) => (start, end),
|
||||||
_ => {
|
_ => {
|
||||||
// self is empty
|
// self is empty
|
||||||
return;
|
return KeySpace::default();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -112,30 +355,37 @@ impl KeySpace {
|
|||||||
.skip_while(|range| self_start >= range.end)
|
.skip_while(|range| self_start >= range.end)
|
||||||
.take_while(|range| self_end > range.start);
|
.take_while(|range| self_end > range.start);
|
||||||
|
|
||||||
|
let mut removed_accum = KeySpaceRandomAccum::new();
|
||||||
for range in other_ranges {
|
for range in other_ranges {
|
||||||
while let Some(overlap_at) = self.overlaps_at(range) {
|
while let Some(overlap_at) = self.overlaps_at(range) {
|
||||||
let overlapped = self.ranges[overlap_at].clone();
|
let overlapped = self.ranges[overlap_at].clone();
|
||||||
|
|
||||||
if overlapped.start < range.start && overlapped.end <= range.end {
|
if overlapped.start < range.start && overlapped.end <= range.end {
|
||||||
// Higher part of the range is completely overlapped.
|
// Higher part of the range is completely overlapped.
|
||||||
|
removed_accum.add_range(range.start..self.ranges[overlap_at].end);
|
||||||
self.ranges[overlap_at].end = range.start;
|
self.ranges[overlap_at].end = range.start;
|
||||||
}
|
}
|
||||||
if overlapped.start >= range.start && overlapped.end > range.end {
|
if overlapped.start >= range.start && overlapped.end > range.end {
|
||||||
// Lower part of the range is completely overlapped.
|
// Lower part of the range is completely overlapped.
|
||||||
|
removed_accum.add_range(self.ranges[overlap_at].start..range.end);
|
||||||
self.ranges[overlap_at].start = range.end;
|
self.ranges[overlap_at].start = range.end;
|
||||||
}
|
}
|
||||||
if overlapped.start < range.start && overlapped.end > range.end {
|
if overlapped.start < range.start && overlapped.end > range.end {
|
||||||
// Middle part of the range is overlapped.
|
// Middle part of the range is overlapped.
|
||||||
|
removed_accum.add_range(range.clone());
|
||||||
self.ranges[overlap_at].end = range.start;
|
self.ranges[overlap_at].end = range.start;
|
||||||
self.ranges
|
self.ranges
|
||||||
.insert(overlap_at + 1, range.end..overlapped.end);
|
.insert(overlap_at + 1, range.end..overlapped.end);
|
||||||
}
|
}
|
||||||
if overlapped.start >= range.start && overlapped.end <= range.end {
|
if overlapped.start >= range.start && overlapped.end <= range.end {
|
||||||
// Whole range is overlapped
|
// Whole range is overlapped
|
||||||
|
removed_accum.add_range(self.ranges[overlap_at].clone());
|
||||||
self.ranges.remove(overlap_at);
|
self.ranges.remove(overlap_at);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
removed_accum.to_keyspace()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn start(&self) -> Option<Key> {
|
pub fn start(&self) -> Option<Key> {
|
||||||
@@ -146,11 +396,11 @@ impl KeySpace {
|
|||||||
self.ranges.last().map(|range| range.end)
|
self.ranges.last().map(|range| range.end)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(unused)]
|
/// The size of the keyspace in pages, before accounting for sharding
|
||||||
pub fn total_size(&self) -> usize {
|
pub fn total_raw_size(&self) -> usize {
|
||||||
self.ranges
|
self.ranges
|
||||||
.iter()
|
.iter()
|
||||||
.map(|range| key_range_size(range) as usize)
|
.map(|range| ShardedRange::raw_size(range) as usize)
|
||||||
.sum()
|
.sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -170,6 +420,11 @@ impl KeySpace {
|
|||||||
pub fn overlaps(&self, range: &Range<Key>) -> bool {
|
pub fn overlaps(&self, range: &Range<Key>) -> bool {
|
||||||
self.overlaps_at(range).is_some()
|
self.overlaps_at(range).is_some()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check if the keyspace contains a key
|
||||||
|
pub fn contains(&self, key: &Key) -> bool {
|
||||||
|
self.overlaps(&(*key..key.next()))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -184,10 +439,33 @@ pub struct KeyPartitioning {
|
|||||||
pub parts: Vec<KeySpace>,
|
pub parts: Vec<KeySpace>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Represents a partitioning of the sparse key space.
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
pub struct SparseKeyPartitioning {
|
||||||
|
pub parts: Vec<SparseKeySpace>,
|
||||||
|
}
|
||||||
|
|
||||||
impl KeyPartitioning {
|
impl KeyPartitioning {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
KeyPartitioning { parts: Vec::new() }
|
KeyPartitioning { parts: Vec::new() }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convert a key partitioning to a sparse partition.
|
||||||
|
pub fn into_sparse(self) -> SparseKeyPartitioning {
|
||||||
|
SparseKeyPartitioning {
|
||||||
|
parts: self.parts.into_iter().map(SparseKeySpace).collect(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SparseKeyPartitioning {
|
||||||
|
/// Note: use this function with caution. Attempting to handle a sparse keyspace in the same way as a dense keyspace will
|
||||||
|
/// cause long/dead loops.
|
||||||
|
pub fn into_dense(self) -> KeyPartitioning {
|
||||||
|
KeyPartitioning {
|
||||||
|
parts: self.parts.into_iter().map(|x| x.0).collect(),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
@@ -219,7 +497,7 @@ impl KeySpaceAccum {
|
|||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
pub fn add_range(&mut self, range: Range<Key>) {
|
pub fn add_range(&mut self, range: Range<Key>) {
|
||||||
self.size += key_range_size(&range) as u64;
|
self.size += ShardedRange::raw_size(&range) as u64;
|
||||||
|
|
||||||
match self.accum.as_mut() {
|
match self.accum.as_mut() {
|
||||||
Some(accum) => {
|
Some(accum) => {
|
||||||
@@ -251,7 +529,9 @@ impl KeySpaceAccum {
|
|||||||
std::mem::take(self).to_keyspace()
|
std::mem::take(self).to_keyspace()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn size(&self) -> u64 {
|
// The total number of keys in this object, ignoring any sharding effects that might cause some of
|
||||||
|
// the keys to be omitted in storage on this shard.
|
||||||
|
pub fn raw_size(&self) -> u64 {
|
||||||
self.size
|
self.size
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -307,36 +587,19 @@ impl KeySpaceRandomAccum {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
|
|
||||||
let start = key_range.start;
|
|
||||||
let end = key_range.end;
|
|
||||||
|
|
||||||
if end.field1 != start.field1
|
|
||||||
|| end.field2 != start.field2
|
|
||||||
|| end.field3 != start.field3
|
|
||||||
|| end.field4 != start.field4
|
|
||||||
{
|
|
||||||
return u32::MAX;
|
|
||||||
}
|
|
||||||
|
|
||||||
let start = (start.field5 as u64) << 32 | start.field6 as u64;
|
|
||||||
let end = (end.field5 as u64) << 32 | end.field6 as u64;
|
|
||||||
|
|
||||||
let diff = end - start;
|
|
||||||
if diff > u32::MAX as u64 {
|
|
||||||
u32::MAX
|
|
||||||
} else {
|
|
||||||
diff as u32
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn singleton_range(key: Key) -> Range<Key> {
|
pub fn singleton_range(key: Key) -> Range<Key> {
|
||||||
key..key.next()
|
key..key.next()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use rand::{RngCore, SeedableRng};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
models::ShardParameters,
|
||||||
|
shard::{ShardCount, ShardNumber},
|
||||||
|
};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use std::fmt::Write;
|
use std::fmt::Write;
|
||||||
|
|
||||||
@@ -379,14 +642,17 @@ mod tests {
|
|||||||
accum.add_range(range.clone());
|
accum.add_range(range.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
|
let expected_size: u64 = ranges
|
||||||
assert_eq!(accum.size(), expected_size);
|
.iter()
|
||||||
|
.map(|r| ShardedRange::raw_size(r) as u64)
|
||||||
|
.sum();
|
||||||
|
assert_eq!(accum.raw_size(), expected_size);
|
||||||
|
|
||||||
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
|
assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
|
||||||
assert_eq!(accum.size(), 0);
|
assert_eq!(accum.raw_size(), 0);
|
||||||
|
|
||||||
assert_ks_eq(&accum.consume_keyspace(), vec![]);
|
assert_ks_eq(&accum.consume_keyspace(), vec![]);
|
||||||
assert_eq!(accum.size(), 0);
|
assert_eq!(accum.raw_size(), 0);
|
||||||
|
|
||||||
for range in &ranges {
|
for range in &ranges {
|
||||||
accum.add_range(range.clone());
|
accum.add_range(range.clone());
|
||||||
@@ -553,7 +819,16 @@ mod tests {
|
|||||||
Key::from_i128(11)..Key::from_i128(13),
|
Key::from_i128(11)..Key::from_i128(13),
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
key_space1.remove_overlapping_with(&key_space2);
|
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||||
|
let removed_expected = KeySpace {
|
||||||
|
ranges: vec![
|
||||||
|
Key::from_i128(2)..Key::from_i128(3),
|
||||||
|
Key::from_i128(6)..Key::from_i128(7),
|
||||||
|
Key::from_i128(11)..Key::from_i128(12),
|
||||||
|
],
|
||||||
|
};
|
||||||
|
assert_eq!(removed, removed_expected);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
key_space1.ranges,
|
key_space1.ranges,
|
||||||
vec![
|
vec![
|
||||||
@@ -583,7 +858,17 @@ mod tests {
|
|||||||
Key::from_i128(14)..Key::from_i128(17),
|
Key::from_i128(14)..Key::from_i128(17),
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
key_space1.remove_overlapping_with(&key_space2);
|
|
||||||
|
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||||
|
let removed_expected = KeySpace {
|
||||||
|
ranges: vec![
|
||||||
|
Key::from_i128(3)..Key::from_i128(5),
|
||||||
|
Key::from_i128(8)..Key::from_i128(10),
|
||||||
|
Key::from_i128(14)..Key::from_i128(15),
|
||||||
|
],
|
||||||
|
};
|
||||||
|
assert_eq!(removed, removed_expected);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
key_space1.ranges,
|
key_space1.ranges,
|
||||||
vec![
|
vec![
|
||||||
@@ -610,7 +895,11 @@ mod tests {
|
|||||||
Key::from_i128(15)..Key::from_i128(17),
|
Key::from_i128(15)..Key::from_i128(17),
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
key_space1.remove_overlapping_with(&key_space2);
|
|
||||||
|
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||||
|
let removed_expected = KeySpace::default();
|
||||||
|
assert_eq!(removed, removed_expected);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
key_space1.ranges,
|
key_space1.ranges,
|
||||||
vec![
|
vec![
|
||||||
@@ -637,7 +926,17 @@ mod tests {
|
|||||||
let key_space2 = KeySpace {
|
let key_space2 = KeySpace {
|
||||||
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
|
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
|
||||||
};
|
};
|
||||||
key_space1.remove_overlapping_with(&key_space2);
|
|
||||||
|
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||||
|
let removed_expected = KeySpace {
|
||||||
|
ranges: vec![
|
||||||
|
Key::from_i128(9)..Key::from_i128(10),
|
||||||
|
Key::from_i128(12)..Key::from_i128(15),
|
||||||
|
Key::from_i128(17)..Key::from_i128(19),
|
||||||
|
],
|
||||||
|
};
|
||||||
|
assert_eq!(removed, removed_expected);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
key_space1.ranges,
|
key_space1.ranges,
|
||||||
vec![
|
vec![
|
||||||
@@ -650,4 +949,412 @@ mod tests {
|
|||||||
]
|
]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
#[test]
|
||||||
|
fn sharded_range_relation_gap() {
|
||||||
|
let shard_identity = ShardIdentity::new(
|
||||||
|
ShardNumber(0),
|
||||||
|
ShardCount::new(4),
|
||||||
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let range = ShardedRange::new(
|
||||||
|
Range {
|
||||||
|
start: Key::from_hex("000000067F00000005000040100300000000").unwrap(),
|
||||||
|
end: Key::from_hex("000000067F00000005000040130000004000").unwrap(),
|
||||||
|
},
|
||||||
|
&shard_identity,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Key range spans relations, expect MAX
|
||||||
|
assert_eq!(range.page_count(), u32::MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn shard_identity_keyspaces_single_key() {
|
||||||
|
let shard_identity = ShardIdentity::new(
|
||||||
|
ShardNumber(1),
|
||||||
|
ShardCount::new(4),
|
||||||
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let range = ShardedRange::new(
|
||||||
|
Range {
|
||||||
|
start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(),
|
||||||
|
end: Key::from_hex("000000067f00000001000000700100000000").unwrap(),
|
||||||
|
},
|
||||||
|
&shard_identity,
|
||||||
|
);
|
||||||
|
// Single-key range on logical size key
|
||||||
|
assert_eq!(range.page_count(), 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test the helper that we use to identify ranges which go outside the data blocks of a single relation
|
||||||
|
#[test]
|
||||||
|
fn contiguous_range_check() {
|
||||||
|
assert!(!is_contiguous_range(
|
||||||
|
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
|
||||||
|
..Key::from_hex("000000067f00000001000004df0100000003").unwrap())
|
||||||
|
),);
|
||||||
|
|
||||||
|
// The range goes all the way up to 0xffffffff, including it: this is
|
||||||
|
// not considered a rel block range because 0xffffffff stores logical sizes,
|
||||||
|
// not blocks.
|
||||||
|
assert!(!is_contiguous_range(
|
||||||
|
&(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
|
||||||
|
..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
|
||||||
|
),);
|
||||||
|
|
||||||
|
// Keys within the normal data region of a relation
|
||||||
|
assert!(is_contiguous_range(
|
||||||
|
&(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
|
||||||
|
..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
|
||||||
|
),);
|
||||||
|
|
||||||
|
// The logical size key of one forkno, then some blocks in the next
|
||||||
|
assert!(is_contiguous_range(
|
||||||
|
&(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
|
||||||
|
..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
|
||||||
|
),);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn shard_identity_keyspaces_forkno_gap() {
|
||||||
|
let shard_identity = ShardIdentity::new(
|
||||||
|
ShardNumber(1),
|
||||||
|
ShardCount::new(4),
|
||||||
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let range = ShardedRange::new(
|
||||||
|
Range {
|
||||||
|
start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(),
|
||||||
|
end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(),
|
||||||
|
},
|
||||||
|
&shard_identity,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Range spanning the end of one forkno and the start of the next: we do not attempt to
|
||||||
|
// calculate a valid size, because we have no way to know if the keys between start
|
||||||
|
// and end are actually in use.
|
||||||
|
assert_eq!(range.page_count(), u32::MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn shard_identity_keyspaces_one_relation() {
|
||||||
|
for shard_number in 0..4 {
|
||||||
|
let shard_identity = ShardIdentity::new(
|
||||||
|
ShardNumber(shard_number),
|
||||||
|
ShardCount::new(4),
|
||||||
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let range = ShardedRange::new(
|
||||||
|
Range {
|
||||||
|
start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(),
|
||||||
|
end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(),
|
||||||
|
},
|
||||||
|
&shard_identity,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Very simple case: range covering block zero of one relation, where that block maps to shard zero
|
||||||
|
if shard_number == 0 {
|
||||||
|
assert_eq!(range.page_count(), 1);
|
||||||
|
} else {
|
||||||
|
// Other shards should perceive the range's size as zero
|
||||||
|
assert_eq!(range.page_count(), 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test helper: construct a ShardedRange and call fragment() on it, returning
|
||||||
|
/// the total page count in the range and the fragments.
|
||||||
|
fn do_fragment(
|
||||||
|
range_start: Key,
|
||||||
|
range_end: Key,
|
||||||
|
shard_identity: &ShardIdentity,
|
||||||
|
target_nblocks: u32,
|
||||||
|
) -> (u32, Vec<(u32, Range<Key>)>) {
|
||||||
|
let range = ShardedRange::new(
|
||||||
|
Range {
|
||||||
|
start: range_start,
|
||||||
|
end: range_end,
|
||||||
|
},
|
||||||
|
shard_identity,
|
||||||
|
);
|
||||||
|
|
||||||
|
let page_count = range.page_count();
|
||||||
|
let fragments = range.fragment(target_nblocks);
|
||||||
|
|
||||||
|
// Invariant: we always get at least one fragment
|
||||||
|
assert!(!fragments.is_empty());
|
||||||
|
|
||||||
|
// Invariant: the first/last fragment start/end should equal the input start/end
|
||||||
|
assert_eq!(fragments.first().unwrap().1.start, range_start);
|
||||||
|
assert_eq!(fragments.last().unwrap().1.end, range_end);
|
||||||
|
|
||||||
|
if page_count > 0 {
|
||||||
|
// Invariant: every fragment must contain at least one shard-local page, if the
|
||||||
|
// total range contains at least one shard-local page
|
||||||
|
let all_nonzero = fragments.iter().all(|f| f.0 > 0);
|
||||||
|
if !all_nonzero {
|
||||||
|
eprintln!("Found a zero-length fragment: {:?}", fragments);
|
||||||
|
}
|
||||||
|
assert!(all_nonzero);
|
||||||
|
} else {
|
||||||
|
// A range with no shard-local pages should always be returned as a single fragment
|
||||||
|
assert_eq!(fragments, vec![(0, range_start..range_end)]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Invariant: fragments must be ordered and non-overlapping
|
||||||
|
let mut last: Option<Range<Key>> = None;
|
||||||
|
for frag in &fragments {
|
||||||
|
if let Some(last) = last {
|
||||||
|
assert!(frag.1.start >= last.end);
|
||||||
|
assert!(frag.1.start > last.start);
|
||||||
|
}
|
||||||
|
last = Some(frag.1.clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Invariant: fragments respect target_nblocks
|
||||||
|
for frag in &fragments {
|
||||||
|
assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
|
||||||
|
}
|
||||||
|
|
||||||
|
(page_count, fragments)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Really simple tests for fragment(), on a range that just contains a single stripe
|
||||||
|
/// for a single tenant.
|
||||||
|
#[test]
|
||||||
|
fn sharded_range_fragment_simple() {
|
||||||
|
let shard_identity = ShardIdentity::new(
|
||||||
|
ShardNumber(0),
|
||||||
|
ShardCount::new(4),
|
||||||
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// A range which we happen to know covers exactly one stripe which belongs to this shard
|
||||||
|
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||||
|
let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
|
||||||
|
|
||||||
|
// Ask for stripe_size blocks, we get the whole stripe
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 32768),
|
||||||
|
(32768, vec![(32768, input_start..input_end)])
|
||||||
|
);
|
||||||
|
|
||||||
|
// Ask for more, we still get the whole stripe
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 10000000),
|
||||||
|
(32768, vec![(32768, input_start..input_end)])
|
||||||
|
);
|
||||||
|
|
||||||
|
// Ask for target_nblocks of half the stripe size, we get two halves
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 16384),
|
||||||
|
(
|
||||||
|
32768,
|
||||||
|
vec![
|
||||||
|
(16384, input_start..input_start.add(16384)),
|
||||||
|
(16384, input_start.add(16384)..input_end)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn sharded_range_fragment_multi_stripe() {
|
||||||
|
let shard_identity = ShardIdentity::new(
|
||||||
|
ShardNumber(0),
|
||||||
|
ShardCount::new(4),
|
||||||
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// A range which covers multiple stripes, exactly one of which belongs to the current shard.
|
||||||
|
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||||
|
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||||
|
// Ask for all the blocks, get a fragment that covers the whole range but reports
|
||||||
|
// its size to be just the blocks belonging to our shard.
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 131072),
|
||||||
|
(32768, vec![(32768, input_start..input_end)])
|
||||||
|
);
|
||||||
|
|
||||||
|
// Ask for a sub-stripe quantity
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 16000),
|
||||||
|
(
|
||||||
|
32768,
|
||||||
|
vec![
|
||||||
|
(16000, input_start..input_start.add(16000)),
|
||||||
|
(16000, input_start.add(16000)..input_start.add(32000)),
|
||||||
|
(768, input_start.add(32000)..input_end),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
// Try on a range that starts slightly after our owned stripe
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
|
||||||
|
(32767, vec![(32767, input_start.add(1)..input_end)])
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test our calculations work correctly when we start a range from the logical size key of
|
||||||
|
/// a previous relation.
|
||||||
|
#[test]
|
||||||
|
fn sharded_range_fragment_starting_from_logical_size() {
|
||||||
|
let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
|
||||||
|
let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
|
||||||
|
|
||||||
|
// Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
|
||||||
|
let shard_identity = ShardIdentity::new(
|
||||||
|
ShardNumber(0),
|
||||||
|
ShardCount::new(4),
|
||||||
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 0x10000),
|
||||||
|
(0x8001, vec![(0x8001, input_start..input_end)])
|
||||||
|
);
|
||||||
|
|
||||||
|
// Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
|
||||||
|
// store all logical sizes)
|
||||||
|
let shard_identity = ShardIdentity::new(
|
||||||
|
ShardNumber(1),
|
||||||
|
ShardCount::new(4),
|
||||||
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 0x10000),
|
||||||
|
(0x1, vec![(0x1, input_start..input_end)])
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Test that ShardedRange behaves properly when used on un-sharded data
|
||||||
|
#[test]
|
||||||
|
fn sharded_range_fragment_unsharded() {
|
||||||
|
let shard_identity = ShardIdentity::unsharded();
|
||||||
|
|
||||||
|
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||||
|
let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 0x8000),
|
||||||
|
(
|
||||||
|
0x10000,
|
||||||
|
vec![
|
||||||
|
(0x8000, input_start..input_start.add(0x8000)),
|
||||||
|
(0x8000, input_start.add(0x8000)..input_start.add(0x10000))
|
||||||
|
]
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn sharded_range_fragment_cross_relation() {
|
||||||
|
let shard_identity = ShardIdentity::unsharded();
|
||||||
|
|
||||||
|
// A range that spans relations: expect fragmentation to give up and return a u32::MAX size
|
||||||
|
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||||
|
let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 0x8000),
|
||||||
|
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
|
||||||
|
);
|
||||||
|
|
||||||
|
// Same, but using a sharded identity
|
||||||
|
let shard_identity = ShardIdentity::new(
|
||||||
|
ShardNumber(0),
|
||||||
|
ShardCount::new(4),
|
||||||
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 0x8000),
|
||||||
|
(u32::MAX, vec![(u32::MAX, input_start..input_end),])
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn sharded_range_fragment_tiny_nblocks() {
|
||||||
|
let shard_identity = ShardIdentity::unsharded();
|
||||||
|
|
||||||
|
// A small range within a single relation: expect fragments of at most target_nblocks keys each
|
||||||
|
let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
|
||||||
|
let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, 16),
|
||||||
|
(
|
||||||
|
0x38,
|
||||||
|
vec![
|
||||||
|
(16, input_start..input_start.add(16)),
|
||||||
|
(16, input_start.add(16)..input_start.add(32)),
|
||||||
|
(16, input_start.add(32)..input_start.add(48)),
|
||||||
|
(8, input_start.add(48)..input_end),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn sharded_range_fragment_fuzz() {
|
||||||
|
// Use a fixed seed: we don't want to explicitly pick values, but we do want
|
||||||
|
// the test to be reproducible.
|
||||||
|
let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);
|
||||||
|
|
||||||
|
for _i in 0..1000 {
|
||||||
|
let shard_identity = if prng.next_u32() % 2 == 0 {
|
||||||
|
ShardIdentity::unsharded()
|
||||||
|
} else {
|
||||||
|
let shard_count = prng.next_u32() % 127 + 1;
|
||||||
|
ShardIdentity::new(
|
||||||
|
ShardNumber((prng.next_u32() % shard_count) as u8),
|
||||||
|
ShardCount::new(shard_count as u8),
|
||||||
|
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
};
|
||||||
|
|
||||||
|
let target_nblocks = prng.next_u32() % 65536 + 1;
|
||||||
|
|
||||||
|
let start_offset = prng.next_u32() % 16384;
|
||||||
|
|
||||||
|
// Try ranges of up to 8192 keys in size, that are always at least 1
|
||||||
|
let range_size = prng.next_u32() % 8192 + 1;
|
||||||
|
|
||||||
|
// A range within a single relation, starting at a random offset from block zero
|
||||||
|
let input_start = Key::from_hex("000000067F00000001000004E10000000000")
|
||||||
|
.unwrap()
|
||||||
|
.add(start_offset);
|
||||||
|
let input_end = input_start.add(range_size);
|
||||||
|
|
||||||
|
// This test's main success conditions are the invariants baked into do_fragment
|
||||||
|
let (_total_size, fragments) =
|
||||||
|
do_fragment(input_start, input_end, &shard_identity, target_nblocks);
|
||||||
|
|
||||||
|
// Pick a random key within the range and check it appears in the output
|
||||||
|
let example_key = input_start.add(prng.next_u32() % range_size);
|
||||||
|
|
||||||
|
// Panic on unwrap if it isn't found
|
||||||
|
let example_key_frag = fragments
|
||||||
|
.iter()
|
||||||
|
.find(|f| f.1.contains(&example_key))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Check that the fragment containing our random key has a nonzero size if
|
||||||
|
// that key is shard-local
|
||||||
|
let example_key_local = !shard_identity.is_key_disposable(&example_key);
|
||||||
|
if example_key_local {
|
||||||
|
assert!(example_key_frag.0 > 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
#![deny(unsafe_code)]
|
#![deny(unsafe_code)]
|
||||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
use const_format::formatcp;
|
|
||||||
|
|
||||||
pub mod controller_api;
|
pub mod controller_api;
|
||||||
pub mod key;
|
pub mod key;
|
||||||
@@ -11,7 +10,4 @@ pub mod shard;
|
|||||||
/// Public API types
|
/// Public API types
|
||||||
pub mod upcall_api;
|
pub mod upcall_api;
|
||||||
|
|
||||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
pub mod config;
|
||||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
|
||||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
|
||||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
pub mod detach_ancestor;
|
||||||
pub mod partitioning;
|
pub mod partitioning;
|
||||||
pub mod utilization;
|
pub mod utilization;
|
||||||
|
|
||||||
@@ -8,6 +9,7 @@ use std::{
|
|||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
io::{BufRead, Read},
|
io::{BufRead, Read},
|
||||||
num::{NonZeroU64, NonZeroUsize},
|
num::{NonZeroU64, NonZeroUsize},
|
||||||
|
sync::atomic::AtomicUsize,
|
||||||
time::{Duration, SystemTime},
|
time::{Duration, SystemTime},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -159,6 +161,22 @@ impl std::fmt::Debug for TenantState {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A temporary lease to a specific lsn inside a timeline.
|
||||||
|
/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
|
||||||
|
#[serde_as]
|
||||||
|
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
|
||||||
|
pub struct LsnLease {
|
||||||
|
#[serde_as(as = "SystemTimeAsRfc3339Millis")]
|
||||||
|
pub valid_until: SystemTime,
|
||||||
|
}
|
||||||
|
|
||||||
|
serde_with::serde_conv!(
|
||||||
|
SystemTimeAsRfc3339Millis,
|
||||||
|
SystemTime,
|
||||||
|
|time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
|
||||||
|
|value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
|
||||||
|
);
|
||||||
|
|
||||||
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
|
/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
|
||||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||||
pub enum ActivatingFrom {
|
pub enum ActivatingFrom {
|
||||||
@@ -287,7 +305,7 @@ pub struct TenantConfig {
|
|||||||
pub compaction_period: Option<String>,
|
pub compaction_period: Option<String>,
|
||||||
pub compaction_threshold: Option<usize>,
|
pub compaction_threshold: Option<usize>,
|
||||||
// defer parsing compaction_algorithm, like eviction_policy
|
// defer parsing compaction_algorithm, like eviction_policy
|
||||||
pub compaction_algorithm: Option<CompactionAlgorithm>,
|
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
|
||||||
pub gc_horizon: Option<u64>,
|
pub gc_horizon: Option<u64>,
|
||||||
pub gc_period: Option<String>,
|
pub gc_period: Option<String>,
|
||||||
pub image_creation_threshold: Option<usize>,
|
pub image_creation_threshold: Option<usize>,
|
||||||
@@ -303,6 +321,103 @@ pub struct TenantConfig {
|
|||||||
pub lazy_slru_download: Option<bool>,
|
pub lazy_slru_download: Option<bool>,
|
||||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||||
pub image_layer_creation_check_threshold: Option<u8>,
|
pub image_layer_creation_check_threshold: Option<u8>,
|
||||||
|
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
|
||||||
|
/// tenant config. When the first aux file is written, the policy will be persisted in the
|
||||||
|
/// `index_part.json` file and has a limited migration path.
|
||||||
|
///
|
||||||
|
/// Currently, we only allow the following migration path:
|
||||||
|
///
|
||||||
|
/// Unset -> V1
|
||||||
|
/// -> V2
|
||||||
|
/// -> CrossValidation -> V2
|
||||||
|
#[derive(
|
||||||
|
Eq,
|
||||||
|
PartialEq,
|
||||||
|
Debug,
|
||||||
|
Copy,
|
||||||
|
Clone,
|
||||||
|
strum_macros::EnumString,
|
||||||
|
strum_macros::Display,
|
||||||
|
serde_with::DeserializeFromStr,
|
||||||
|
serde_with::SerializeDisplay,
|
||||||
|
)]
|
||||||
|
#[strum(serialize_all = "kebab-case")]
|
||||||
|
pub enum AuxFilePolicy {
|
||||||
|
/// V1 aux file policy: store everything in AUX_FILE_KEY
|
||||||
|
#[strum(ascii_case_insensitive)]
|
||||||
|
V1,
|
||||||
|
/// V2 aux file policy: store in the AUX_FILE keyspace
|
||||||
|
#[strum(ascii_case_insensitive)]
|
||||||
|
V2,
|
||||||
|
/// Cross validation runs both formats on the write path and does validation
|
||||||
|
/// on the read path.
|
||||||
|
#[strum(ascii_case_insensitive)]
|
||||||
|
CrossValidation,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AuxFilePolicy {
|
||||||
|
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
|
||||||
|
matches!(
|
||||||
|
(from, to),
|
||||||
|
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// If a tenant writes aux files without setting `switch_aux_file_policy`, this value will be used.
|
||||||
|
pub fn default_tenant_config() -> Self {
|
||||||
|
Self::V1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
|
||||||
|
pub struct AtomicAuxFilePolicy(AtomicUsize);
|
||||||
|
|
||||||
|
impl AtomicAuxFilePolicy {
|
||||||
|
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
|
||||||
|
Self(AtomicUsize::new(
|
||||||
|
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn load(&self) -> Option<AuxFilePolicy> {
|
||||||
|
match self.0.load(std::sync::atomic::Ordering::Acquire) {
|
||||||
|
0 => None,
|
||||||
|
other => Some(AuxFilePolicy::from_usize(other)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn store(&self, policy: Option<AuxFilePolicy>) {
|
||||||
|
self.0.store(
|
||||||
|
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||||
|
std::sync::atomic::Ordering::Release,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AuxFilePolicy {
|
||||||
|
pub fn to_usize(self) -> usize {
|
||||||
|
match self {
|
||||||
|
Self::V1 => 1,
|
||||||
|
Self::CrossValidation => 2,
|
||||||
|
Self::V2 => 3,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn try_from_usize(this: usize) -> Option<Self> {
|
||||||
|
match this {
|
||||||
|
1 => Some(Self::V1),
|
||||||
|
2 => Some(Self::CrossValidation),
|
||||||
|
3 => Some(Self::V2),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn from_usize(this: usize) -> Self {
|
||||||
|
Self::try_from_usize(this).unwrap()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
@@ -323,13 +438,28 @@ impl EvictionPolicy {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(
|
||||||
#[serde(tag = "kind")]
|
Eq,
|
||||||
|
PartialEq,
|
||||||
|
Debug,
|
||||||
|
Copy,
|
||||||
|
Clone,
|
||||||
|
strum_macros::EnumString,
|
||||||
|
strum_macros::Display,
|
||||||
|
serde_with::DeserializeFromStr,
|
||||||
|
serde_with::SerializeDisplay,
|
||||||
|
)]
|
||||||
|
#[strum(serialize_all = "kebab-case")]
|
||||||
pub enum CompactionAlgorithm {
|
pub enum CompactionAlgorithm {
|
||||||
Legacy,
|
Legacy,
|
||||||
Tiered,
|
Tiered,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct CompactionAlgorithmSettings {
|
||||||
|
pub kind: CompactionAlgorithm,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||||
pub struct EvictionPolicyLayerAccessThreshold {
|
pub struct EvictionPolicyLayerAccessThreshold {
|
||||||
#[serde(with = "humantime_serde")]
|
#[serde(with = "humantime_serde")]
|
||||||
@@ -429,7 +559,6 @@ pub struct StatusResponse {
|
|||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
#[serde(deny_unknown_fields)]
|
#[serde(deny_unknown_fields)]
|
||||||
pub struct TenantLocationConfigRequest {
|
pub struct TenantLocationConfigRequest {
|
||||||
pub tenant_id: Option<TenantShardId>,
|
|
||||||
#[serde(flatten)]
|
#[serde(flatten)]
|
||||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||||
}
|
}
|
||||||
@@ -578,6 +707,9 @@ pub struct TimelineInfo {
|
|||||||
pub state: TimelineState,
|
pub state: TimelineState,
|
||||||
|
|
||||||
pub walreceiver_status: String,
|
pub walreceiver_status: String,
|
||||||
|
|
||||||
|
/// The last aux file policy being used on this timeline
|
||||||
|
pub last_aux_file_policy: Option<AuxFilePolicy>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
@@ -684,6 +816,8 @@ pub enum HistoricLayerInfo {
|
|||||||
lsn_end: Lsn,
|
lsn_end: Lsn,
|
||||||
remote: bool,
|
remote: bool,
|
||||||
access_stats: LayerAccessStats,
|
access_stats: LayerAccessStats,
|
||||||
|
|
||||||
|
l0: bool,
|
||||||
},
|
},
|
||||||
Image {
|
Image {
|
||||||
layer_file_name: String,
|
layer_file_name: String,
|
||||||
@@ -719,6 +853,16 @@ impl HistoricLayerInfo {
|
|||||||
};
|
};
|
||||||
*field = value;
|
*field = value;
|
||||||
}
|
}
|
||||||
|
pub fn layer_file_size(&self) -> u64 {
|
||||||
|
match self {
|
||||||
|
HistoricLayerInfo::Delta {
|
||||||
|
layer_file_size, ..
|
||||||
|
} => *layer_file_size,
|
||||||
|
HistoricLayerInfo::Image {
|
||||||
|
layer_file_size, ..
|
||||||
|
} => *layer_file_size,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
@@ -726,6 +870,16 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
|
|||||||
pub max_concurrent_downloads: NonZeroUsize,
|
pub max_concurrent_downloads: NonZeroUsize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
pub struct IngestAuxFilesRequest {
|
||||||
|
pub aux_files: HashMap<String, String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
pub struct ListAuxFilesRequest {
|
||||||
|
pub lsn: Lsn,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
pub struct DownloadRemoteLayersTaskInfo {
|
pub struct DownloadRemoteLayersTaskInfo {
|
||||||
pub task_id: String,
|
pub task_id: String,
|
||||||
@@ -750,9 +904,6 @@ pub struct TimelineGcRequest {
|
|||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct WalRedoManagerProcessStatus {
|
pub struct WalRedoManagerProcessStatus {
|
||||||
pub pid: u32,
|
pub pid: u32,
|
||||||
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
|
|
||||||
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
|
|
||||||
pub kind: Cow<'static, str>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
@@ -780,6 +931,66 @@ pub struct SecondaryProgress {
|
|||||||
pub bytes_total: u64,
|
pub bytes_total: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
pub struct TenantScanRemoteStorageShard {
|
||||||
|
pub tenant_shard_id: TenantShardId,
|
||||||
|
pub generation: Option<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||||
|
pub struct TenantScanRemoteStorageResponse {
|
||||||
|
pub shards: Vec<TenantScanRemoteStorageShard>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
pub enum TenantSorting {
|
||||||
|
ResidentSize,
|
||||||
|
MaxLogicalSize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for TenantSorting {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::ResidentSize
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
|
pub struct TopTenantShardsRequest {
|
||||||
|
// How would you like to sort the tenants?
|
||||||
|
pub order_by: TenantSorting,
|
||||||
|
|
||||||
|
// How many results?
|
||||||
|
pub limit: usize,
|
||||||
|
|
||||||
|
// Omit tenants with more than this many shards (e.g. if this is the max number of shards
|
||||||
|
// that the caller would ever split to)
|
||||||
|
pub where_shards_lt: Option<ShardCount>,
|
||||||
|
|
||||||
|
// Omit tenants where the ordering metric is less than this (this is an optimization to
|
||||||
|
// let us quickly exclude numerous tiny shards)
|
||||||
|
pub where_gt: Option<u64>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||||
|
pub struct TopTenantShardItem {
|
||||||
|
pub id: TenantShardId,
|
||||||
|
|
||||||
|
/// Total size of layers on local disk for all timelines in this tenant
|
||||||
|
pub resident_size: u64,
|
||||||
|
|
||||||
|
/// Total size of layers in remote storage for all timelines in this tenant
|
||||||
|
pub physical_size: u64,
|
||||||
|
|
||||||
|
/// The largest logical size of a timeline within this tenant
|
||||||
|
pub max_logical_size: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||||
|
pub struct TopTenantShardsResponse {
|
||||||
|
pub shards: Vec<TopTenantShardItem>,
|
||||||
|
}
|
||||||
|
|
||||||
pub mod virtual_file {
|
pub mod virtual_file {
|
||||||
#[derive(
|
#[derive(
|
||||||
Copy,
|
Copy,
|
||||||
@@ -847,39 +1058,72 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// In the V2 protocol version, a GetPage request contains two LSN values:
|
||||||
|
//
|
||||||
|
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
|
||||||
|
// "get the latest version present". It's used by the primary server, which knows that no one else
|
||||||
|
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
|
||||||
|
// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
|
||||||
|
//
|
||||||
|
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
|
||||||
|
// modified between 'not_modified_since' and the request LSN. It's always correct to set
|
||||||
|
// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
|
||||||
|
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
|
||||||
|
// request without waiting for 'request_lsn' to arrive.
|
||||||
|
//
|
||||||
|
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||||
|
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
|
||||||
|
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
|
||||||
|
// standby to request a page at a particular non-latest LSN, and also include the
|
||||||
|
// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
|
||||||
|
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
|
||||||
|
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
|
||||||
|
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
|
||||||
|
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
|
||||||
|
// difference in the responses between V1 and V2.
|
||||||
|
//
|
||||||
|
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
|
||||||
|
// maps the old format requests to the new format.
|
||||||
|
//
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub enum PagestreamProtocolVersion {
|
||||||
|
V1,
|
||||||
|
V2,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
pub struct PagestreamExistsRequest {
|
pub struct PagestreamExistsRequest {
|
||||||
pub latest: bool,
|
pub request_lsn: Lsn,
|
||||||
pub lsn: Lsn,
|
pub not_modified_since: Lsn,
|
||||||
pub rel: RelTag,
|
pub rel: RelTag,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
pub struct PagestreamNblocksRequest {
|
pub struct PagestreamNblocksRequest {
|
||||||
pub latest: bool,
|
pub request_lsn: Lsn,
|
||||||
pub lsn: Lsn,
|
pub not_modified_since: Lsn,
|
||||||
pub rel: RelTag,
|
pub rel: RelTag,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
pub struct PagestreamGetPageRequest {
|
pub struct PagestreamGetPageRequest {
|
||||||
pub latest: bool,
|
pub request_lsn: Lsn,
|
||||||
pub lsn: Lsn,
|
pub not_modified_since: Lsn,
|
||||||
pub rel: RelTag,
|
pub rel: RelTag,
|
||||||
pub blkno: u32,
|
pub blkno: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
pub struct PagestreamDbSizeRequest {
|
pub struct PagestreamDbSizeRequest {
|
||||||
pub latest: bool,
|
pub request_lsn: Lsn,
|
||||||
pub lsn: Lsn,
|
pub not_modified_since: Lsn,
|
||||||
pub dbnode: u32,
|
pub dbnode: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
pub struct PagestreamGetSlruSegmentRequest {
|
pub struct PagestreamGetSlruSegmentRequest {
|
||||||
pub latest: bool,
|
pub request_lsn: Lsn,
|
||||||
pub lsn: Lsn,
|
pub not_modified_since: Lsn,
|
||||||
pub kind: u8,
|
pub kind: u8,
|
||||||
pub segno: u32,
|
pub segno: u32,
|
||||||
}
|
}
|
||||||
@@ -926,14 +1170,16 @@ pub struct TenantHistorySize {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl PagestreamFeMessage {
|
impl PagestreamFeMessage {
|
||||||
|
/// Serialize a compute -> pageserver message. This is currently only used in testing
|
||||||
|
/// tools. Always uses protocol version 2.
|
||||||
pub fn serialize(&self) -> Bytes {
|
pub fn serialize(&self) -> Bytes {
|
||||||
let mut bytes = BytesMut::new();
|
let mut bytes = BytesMut::new();
|
||||||
|
|
||||||
match self {
|
match self {
|
||||||
Self::Exists(req) => {
|
Self::Exists(req) => {
|
||||||
bytes.put_u8(0);
|
bytes.put_u8(0);
|
||||||
bytes.put_u8(u8::from(req.latest));
|
bytes.put_u64(req.request_lsn.0);
|
||||||
bytes.put_u64(req.lsn.0);
|
bytes.put_u64(req.not_modified_since.0);
|
||||||
bytes.put_u32(req.rel.spcnode);
|
bytes.put_u32(req.rel.spcnode);
|
||||||
bytes.put_u32(req.rel.dbnode);
|
bytes.put_u32(req.rel.dbnode);
|
||||||
bytes.put_u32(req.rel.relnode);
|
bytes.put_u32(req.rel.relnode);
|
||||||
@@ -942,8 +1188,8 @@ impl PagestreamFeMessage {
|
|||||||
|
|
||||||
Self::Nblocks(req) => {
|
Self::Nblocks(req) => {
|
||||||
bytes.put_u8(1);
|
bytes.put_u8(1);
|
||||||
bytes.put_u8(u8::from(req.latest));
|
bytes.put_u64(req.request_lsn.0);
|
||||||
bytes.put_u64(req.lsn.0);
|
bytes.put_u64(req.not_modified_since.0);
|
||||||
bytes.put_u32(req.rel.spcnode);
|
bytes.put_u32(req.rel.spcnode);
|
||||||
bytes.put_u32(req.rel.dbnode);
|
bytes.put_u32(req.rel.dbnode);
|
||||||
bytes.put_u32(req.rel.relnode);
|
bytes.put_u32(req.rel.relnode);
|
||||||
@@ -952,8 +1198,8 @@ impl PagestreamFeMessage {
|
|||||||
|
|
||||||
Self::GetPage(req) => {
|
Self::GetPage(req) => {
|
||||||
bytes.put_u8(2);
|
bytes.put_u8(2);
|
||||||
bytes.put_u8(u8::from(req.latest));
|
bytes.put_u64(req.request_lsn.0);
|
||||||
bytes.put_u64(req.lsn.0);
|
bytes.put_u64(req.not_modified_since.0);
|
||||||
bytes.put_u32(req.rel.spcnode);
|
bytes.put_u32(req.rel.spcnode);
|
||||||
bytes.put_u32(req.rel.dbnode);
|
bytes.put_u32(req.rel.dbnode);
|
||||||
bytes.put_u32(req.rel.relnode);
|
bytes.put_u32(req.rel.relnode);
|
||||||
@@ -963,15 +1209,15 @@ impl PagestreamFeMessage {
|
|||||||
|
|
||||||
Self::DbSize(req) => {
|
Self::DbSize(req) => {
|
||||||
bytes.put_u8(3);
|
bytes.put_u8(3);
|
||||||
bytes.put_u8(u8::from(req.latest));
|
bytes.put_u64(req.request_lsn.0);
|
||||||
bytes.put_u64(req.lsn.0);
|
bytes.put_u64(req.not_modified_since.0);
|
||||||
bytes.put_u32(req.dbnode);
|
bytes.put_u32(req.dbnode);
|
||||||
}
|
}
|
||||||
|
|
||||||
Self::GetSlruSegment(req) => {
|
Self::GetSlruSegment(req) => {
|
||||||
bytes.put_u8(4);
|
bytes.put_u8(4);
|
||||||
bytes.put_u8(u8::from(req.latest));
|
bytes.put_u64(req.request_lsn.0);
|
||||||
bytes.put_u64(req.lsn.0);
|
bytes.put_u64(req.not_modified_since.0);
|
||||||
bytes.put_u8(req.kind);
|
bytes.put_u8(req.kind);
|
||||||
bytes.put_u32(req.segno);
|
bytes.put_u32(req.segno);
|
||||||
}
|
}
|
||||||
@@ -980,18 +1226,40 @@ impl PagestreamFeMessage {
|
|||||||
bytes.into()
|
bytes.into()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
|
pub fn parse<R: std::io::Read>(
|
||||||
// TODO these gets can fail
|
body: &mut R,
|
||||||
|
protocol_version: PagestreamProtocolVersion,
|
||||||
|
) -> anyhow::Result<PagestreamFeMessage> {
|
||||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||||
//
|
//
|
||||||
// TODO: consider using protobuf or serde bincode for less error prone
|
// TODO: consider using protobuf or serde bincode for less error prone
|
||||||
// serialization.
|
// serialization.
|
||||||
let msg_tag = body.read_u8()?;
|
let msg_tag = body.read_u8()?;
|
||||||
|
|
||||||
|
let (request_lsn, not_modified_since) = match protocol_version {
|
||||||
|
PagestreamProtocolVersion::V2 => (
|
||||||
|
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||||
|
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||||
|
),
|
||||||
|
PagestreamProtocolVersion::V1 => {
|
||||||
|
// In the old protocol, each message starts with a boolean 'latest' flag,
|
||||||
|
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
|
||||||
|
// 'not_modified_since', used in the new protocol version.
|
||||||
|
let latest = body.read_u8()? != 0;
|
||||||
|
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||||
|
if latest {
|
||||||
|
(Lsn::MAX, request_lsn) // get latest version
|
||||||
|
} else {
|
||||||
|
(request_lsn, request_lsn) // get version at specified LSN
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// The rest of the messages are the same between V1 and V2
|
||||||
match msg_tag {
|
match msg_tag {
|
||||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||||
latest: body.read_u8()? != 0,
|
request_lsn,
|
||||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
not_modified_since,
|
||||||
rel: RelTag {
|
rel: RelTag {
|
||||||
spcnode: body.read_u32::<BigEndian>()?,
|
spcnode: body.read_u32::<BigEndian>()?,
|
||||||
dbnode: body.read_u32::<BigEndian>()?,
|
dbnode: body.read_u32::<BigEndian>()?,
|
||||||
@@ -1000,8 +1268,8 @@ impl PagestreamFeMessage {
|
|||||||
},
|
},
|
||||||
})),
|
})),
|
||||||
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||||
latest: body.read_u8()? != 0,
|
request_lsn,
|
||||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
not_modified_since,
|
||||||
rel: RelTag {
|
rel: RelTag {
|
||||||
spcnode: body.read_u32::<BigEndian>()?,
|
spcnode: body.read_u32::<BigEndian>()?,
|
||||||
dbnode: body.read_u32::<BigEndian>()?,
|
dbnode: body.read_u32::<BigEndian>()?,
|
||||||
@@ -1010,8 +1278,8 @@ impl PagestreamFeMessage {
|
|||||||
},
|
},
|
||||||
})),
|
})),
|
||||||
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||||
latest: body.read_u8()? != 0,
|
request_lsn,
|
||||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
not_modified_since,
|
||||||
rel: RelTag {
|
rel: RelTag {
|
||||||
spcnode: body.read_u32::<BigEndian>()?,
|
spcnode: body.read_u32::<BigEndian>()?,
|
||||||
dbnode: body.read_u32::<BigEndian>()?,
|
dbnode: body.read_u32::<BigEndian>()?,
|
||||||
@@ -1021,14 +1289,14 @@ impl PagestreamFeMessage {
|
|||||||
blkno: body.read_u32::<BigEndian>()?,
|
blkno: body.read_u32::<BigEndian>()?,
|
||||||
})),
|
})),
|
||||||
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||||
latest: body.read_u8()? != 0,
|
request_lsn,
|
||||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
not_modified_since,
|
||||||
dbnode: body.read_u32::<BigEndian>()?,
|
dbnode: body.read_u32::<BigEndian>()?,
|
||||||
})),
|
})),
|
||||||
4 => Ok(PagestreamFeMessage::GetSlruSegment(
|
4 => Ok(PagestreamFeMessage::GetSlruSegment(
|
||||||
PagestreamGetSlruSegmentRequest {
|
PagestreamGetSlruSegmentRequest {
|
||||||
latest: body.read_u8()? != 0,
|
request_lsn,
|
||||||
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
|
not_modified_since,
|
||||||
kind: body.read_u8()?,
|
kind: body.read_u8()?,
|
||||||
segno: body.read_u32::<BigEndian>()?,
|
segno: body.read_u32::<BigEndian>()?,
|
||||||
},
|
},
|
||||||
@@ -1148,6 +1416,7 @@ impl PagestreamBeMessage {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
@@ -1156,8 +1425,8 @@ mod tests {
|
|||||||
// Test serialization/deserialization of PagestreamFeMessage
|
// Test serialization/deserialization of PagestreamFeMessage
|
||||||
let messages = vec![
|
let messages = vec![
|
||||||
PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||||
latest: true,
|
request_lsn: Lsn(4),
|
||||||
lsn: Lsn(4),
|
not_modified_since: Lsn(3),
|
||||||
rel: RelTag {
|
rel: RelTag {
|
||||||
forknum: 1,
|
forknum: 1,
|
||||||
spcnode: 2,
|
spcnode: 2,
|
||||||
@@ -1166,8 +1435,8 @@ mod tests {
|
|||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||||
latest: false,
|
request_lsn: Lsn(4),
|
||||||
lsn: Lsn(4),
|
not_modified_since: Lsn(4),
|
||||||
rel: RelTag {
|
rel: RelTag {
|
||||||
forknum: 1,
|
forknum: 1,
|
||||||
spcnode: 2,
|
spcnode: 2,
|
||||||
@@ -1176,8 +1445,8 @@ mod tests {
|
|||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||||
latest: true,
|
request_lsn: Lsn(4),
|
||||||
lsn: Lsn(4),
|
not_modified_since: Lsn(3),
|
||||||
rel: RelTag {
|
rel: RelTag {
|
||||||
forknum: 1,
|
forknum: 1,
|
||||||
spcnode: 2,
|
spcnode: 2,
|
||||||
@@ -1187,14 +1456,16 @@ mod tests {
|
|||||||
blkno: 7,
|
blkno: 7,
|
||||||
}),
|
}),
|
||||||
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||||
latest: true,
|
request_lsn: Lsn(4),
|
||||||
lsn: Lsn(4),
|
not_modified_since: Lsn(3),
|
||||||
dbnode: 7,
|
dbnode: 7,
|
||||||
}),
|
}),
|
||||||
];
|
];
|
||||||
for msg in messages {
|
for msg in messages {
|
||||||
let bytes = msg.serialize();
|
let bytes = msg.serialize();
|
||||||
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
|
let reconstructed =
|
||||||
|
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
|
||||||
|
.unwrap();
|
||||||
assert!(msg == reconstructed);
|
assert!(msg == reconstructed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1353,4 +1624,69 @@ mod tests {
|
|||||||
assert_eq!(actual, expected, "example on {line}");
|
assert_eq!(actual, expected, "example on {line}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_aux_file_migration_path() {
|
||||||
|
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||||
|
None,
|
||||||
|
AuxFilePolicy::V1
|
||||||
|
));
|
||||||
|
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||||
|
None,
|
||||||
|
AuxFilePolicy::V2
|
||||||
|
));
|
||||||
|
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||||
|
None,
|
||||||
|
AuxFilePolicy::CrossValidation
|
||||||
|
));
|
||||||
|
// Self-migration is not a valid migration path, and the caller should handle it by itself.
|
||||||
|
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||||
|
Some(AuxFilePolicy::V1),
|
||||||
|
AuxFilePolicy::V1
|
||||||
|
));
|
||||||
|
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||||
|
Some(AuxFilePolicy::V2),
|
||||||
|
AuxFilePolicy::V2
|
||||||
|
));
|
||||||
|
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||||
|
Some(AuxFilePolicy::CrossValidation),
|
||||||
|
AuxFilePolicy::CrossValidation
|
||||||
|
));
|
||||||
|
// Migrations not allowed
|
||||||
|
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||||
|
Some(AuxFilePolicy::CrossValidation),
|
||||||
|
AuxFilePolicy::V1
|
||||||
|
));
|
||||||
|
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||||
|
Some(AuxFilePolicy::V1),
|
||||||
|
AuxFilePolicy::V2
|
||||||
|
));
|
||||||
|
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||||
|
Some(AuxFilePolicy::V2),
|
||||||
|
AuxFilePolicy::V1
|
||||||
|
));
|
||||||
|
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||||
|
Some(AuxFilePolicy::V2),
|
||||||
|
AuxFilePolicy::CrossValidation
|
||||||
|
));
|
||||||
|
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||||
|
Some(AuxFilePolicy::V1),
|
||||||
|
AuxFilePolicy::CrossValidation
|
||||||
|
));
|
||||||
|
// Migrations allowed
|
||||||
|
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||||
|
Some(AuxFilePolicy::CrossValidation),
|
||||||
|
AuxFilePolicy::V2
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_aux_parse() {
|
||||||
|
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
|
||||||
|
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
|
||||||
|
assert_eq!(
|
||||||
|
AuxFilePolicy::from_str("cross-validation").unwrap(),
|
||||||
|
AuxFilePolicy::CrossValidation
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
6
libs/pageserver_api/src/models/detach_ancestor.rs
Normal file
6
libs/pageserver_api/src/models/detach_ancestor.rs
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
use utils::id::TimelineId;
|
||||||
|
|
||||||
|
#[derive(Default, serde::Serialize)]
|
||||||
|
pub struct AncestorDetached {
|
||||||
|
pub reparented_timelines: Vec<TimelineId>,
|
||||||
|
}
|
||||||
@@ -1,9 +1,11 @@
|
|||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
|
use crate::keyspace::SparseKeySpace;
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
pub struct Partitioning {
|
pub struct Partitioning {
|
||||||
pub keys: crate::keyspace::KeySpace,
|
pub keys: crate::keyspace::KeySpace,
|
||||||
|
pub sparse_keys: crate::keyspace::SparseKeySpace,
|
||||||
pub at_lsn: Lsn,
|
pub at_lsn: Lsn,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning {
|
|||||||
let mut map = serializer.serialize_map(Some(2))?;
|
let mut map = serializer.serialize_map(Some(2))?;
|
||||||
map.serialize_key("keys")?;
|
map.serialize_key("keys")?;
|
||||||
map.serialize_value(&KeySpace(&self.keys))?;
|
map.serialize_value(&KeySpace(&self.keys))?;
|
||||||
|
map.serialize_key("sparse_keys")?;
|
||||||
|
map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
|
||||||
map.serialize_key("at_lsn")?;
|
map.serialize_key("at_lsn")?;
|
||||||
map.serialize_value(&WithDisplay(&self.at_lsn))?;
|
map.serialize_value(&WithDisplay(&self.at_lsn))?;
|
||||||
map.end()
|
map.end()
|
||||||
@@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
|
|||||||
#[derive(serde::Deserialize)]
|
#[derive(serde::Deserialize)]
|
||||||
struct De {
|
struct De {
|
||||||
keys: KeySpace,
|
keys: KeySpace,
|
||||||
|
sparse_keys: KeySpace,
|
||||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||||
at_lsn: Lsn,
|
at_lsn: Lsn,
|
||||||
}
|
}
|
||||||
@@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
|
|||||||
Ok(Self {
|
Ok(Self {
|
||||||
at_lsn: de.at_lsn,
|
at_lsn: de.at_lsn,
|
||||||
keys: de.keys.0,
|
keys: de.keys.0,
|
||||||
|
sparse_keys: SparseKeySpace(de.sparse_keys.0),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -133,6 +139,12 @@ mod tests {
|
|||||||
"030000000000000000000000000000000003"
|
"030000000000000000000000000000000003"
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
|
"sparse_keys": [
|
||||||
|
[
|
||||||
|
"620000000000000000000000000000000000",
|
||||||
|
"620000000000000000000000000000000003"
|
||||||
|
]
|
||||||
|
],
|
||||||
"at_lsn": "0/2240160"
|
"at_lsn": "0/2240160"
|
||||||
}
|
}
|
||||||
"#;
|
"#;
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use crate::{
|
|||||||
models::ShardParameters,
|
models::ShardParameters,
|
||||||
};
|
};
|
||||||
use hex::FromHex;
|
use hex::FromHex;
|
||||||
|
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::TenantId;
|
use utils::id::TenantId;
|
||||||
|
|
||||||
@@ -96,7 +97,7 @@ impl ShardCount {
|
|||||||
|
|
||||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||||
/// as `TenantShardId::unsharded`.
|
/// as [`TenantShardId::unsharded`].
|
||||||
///
|
///
|
||||||
/// This method returns the actual number of shards, i.e. if our internal value is
|
/// This method returns the actual number of shards, i.e. if our internal value is
|
||||||
/// zero, we return 1 (unsharded tenants have 1 shard).
|
/// zero, we return 1 (unsharded tenants have 1 shard).
|
||||||
@@ -115,14 +116,16 @@ impl ShardCount {
|
|||||||
self.0
|
self.0
|
||||||
}
|
}
|
||||||
|
|
||||||
///
|
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
|
||||||
|
/// uses the legacy format for `TenantShardId`. See also the documentation for
|
||||||
|
/// [`Self::count`].
|
||||||
pub fn is_unsharded(&self) -> bool {
|
pub fn is_unsharded(&self) -> bool {
|
||||||
self.0 == 0
|
self.0 == 0
|
||||||
}
|
}
|
||||||
|
|
||||||
/// `v` may be zero, or the number of shards in the tenant. `v` is what
|
/// `v` may be zero, or the number of shards in the tenant. `v` is what
|
||||||
/// [`Self::literal`] would return.
|
/// [`Self::literal`] would return.
|
||||||
pub fn new(val: u8) -> Self {
|
pub const fn new(val: u8) -> Self {
|
||||||
Self(val)
|
Self(val)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -450,7 +453,7 @@ impl ShardIdentity {
|
|||||||
/// An identity with number=0 count=0 is a "none" identity, which represents legacy
|
/// An identity with number=0 count=0 is a "none" identity, which represents legacy
|
||||||
/// tenants. Modern single-shard tenants should not use this: they should
|
/// tenants. Modern single-shard tenants should not use this: they should
|
||||||
/// have number=0 count=1.
|
/// have number=0 count=1.
|
||||||
pub fn unsharded() -> Self {
|
pub const fn unsharded() -> Self {
|
||||||
Self {
|
Self {
|
||||||
number: ShardNumber(0),
|
number: ShardNumber(0),
|
||||||
count: ShardCount(0),
|
count: ShardCount(0),
|
||||||
@@ -556,6 +559,14 @@ impl ShardIdentity {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Obtains the shard number and count combined into a `ShardIndex`.
|
||||||
|
pub fn shard_index(&self) -> ShardIndex {
|
||||||
|
ShardIndex {
|
||||||
|
shard_count: self.count,
|
||||||
|
shard_number: self.number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn shard_slug(&self) -> String {
|
pub fn shard_slug(&self) -> String {
|
||||||
if self.count > ShardCount(0) {
|
if self.count > ShardCount(0) {
|
||||||
format!("-{:02x}{:02x}", self.number.0, self.count.0)
|
format!("-{:02x}{:02x}", self.number.0, self.count.0)
|
||||||
@@ -649,7 +660,13 @@ fn key_is_shard0(key: &Key) -> bool {
|
|||||||
// relation pages are distributed to shards other than shard zero. Everything else gets
|
// relation pages are distributed to shards other than shard zero. Everything else gets
|
||||||
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
|
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
|
||||||
// requests, and any request other than those for particular blocks in relations.
|
// requests, and any request other than those for particular blocks in relations.
|
||||||
!is_rel_block_key(key)
|
//
|
||||||
|
// The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
|
||||||
|
// type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
|
||||||
|
// because they must be included in basebackups.
|
||||||
|
let is_initfork = key.field5 == INIT_FORKNUM;
|
||||||
|
|
||||||
|
!is_rel_block_key(key) || is_initfork
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||||
|
|||||||
@@ -820,10 +820,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
|||||||
Ok(ProcessMsgResult::Continue)
|
Ok(ProcessMsgResult::Continue)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Log as info/error result of handling COPY stream and send back
|
/// - Log as info/error result of handling COPY stream and send back
|
||||||
/// ErrorResponse if that makes sense. Shutdown the stream if we got
|
/// ErrorResponse if that makes sense.
|
||||||
/// Terminate. TODO: transition into waiting for Sync msg if we initiate the
|
/// - Shutdown the stream if we got Terminate.
|
||||||
/// close.
|
/// - Then close the connection because we don't handle exiting from COPY
|
||||||
|
/// stream normally.
|
||||||
pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
|
pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
|
||||||
use CopyStreamHandlerEnd::*;
|
use CopyStreamHandlerEnd::*;
|
||||||
|
|
||||||
@@ -849,10 +850,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Terminate = &end {
|
|
||||||
self.state = ProtoState::Closed;
|
|
||||||
}
|
|
||||||
|
|
||||||
let err_to_send_and_errcode = match &end {
|
let err_to_send_and_errcode = match &end {
|
||||||
ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
|
ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
|
||||||
Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
|
Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
|
||||||
@@ -882,6 +879,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
|||||||
error!("failed to send ErrorResponse: {}", ee);
|
error!("failed to send ErrorResponse: {}", ee);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Proper COPY stream finishing to continue using the connection is not
|
||||||
|
// implemented at the server side (we don't need it so far). To prevent
|
||||||
|
// further usages of the connection, close it.
|
||||||
|
self.framed.shutdown().await.ok();
|
||||||
|
self.state = ProtoState::Closed;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -178,6 +178,13 @@ impl PgConnectionConfig {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for PgConnectionConfig {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
// The password is intentionally hidden and not part of this display string.
|
||||||
|
write!(f, "postgresql://{}:{}", self.host, self.port)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl fmt::Debug for PgConnectionConfig {
|
impl fmt::Debug for PgConnectionConfig {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
// We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
|
// We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
|
||||||
|
|||||||
@@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
|
|||||||
// Likewise for these, although the assumption that these don't change is a little more iffy.
|
// Likewise for these, although the assumption that these don't change is a little more iffy.
|
||||||
pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
|
pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
|
||||||
pub use v14::bindings::{PageHeaderData, XLogRecord};
|
pub use v14::bindings::{PageHeaderData, XLogRecord};
|
||||||
pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
pub use v14::xlog_utils::{
|
||||||
|
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||||
|
};
|
||||||
|
|
||||||
pub use v14::bindings::{CheckPoint, ControlFileData};
|
pub use v14::bindings::{CheckPoint, ControlFileData};
|
||||||
|
|
||||||
|
|||||||
@@ -331,7 +331,10 @@ impl CheckPoint {
|
|||||||
/// Returns 'true' if the XID was updated.
|
/// Returns 'true' if the XID was updated.
|
||||||
pub fn update_next_xid(&mut self, xid: u32) -> bool {
|
pub fn update_next_xid(&mut self, xid: u32) -> bool {
|
||||||
// nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparound.
|
// nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparound.
|
||||||
let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
let mut new_xid = std::cmp::max(
|
||||||
|
xid.wrapping_add(1),
|
||||||
|
pg_constants::FIRST_NORMAL_TRANSACTION_ID,
|
||||||
|
);
|
||||||
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
|
// To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
|
||||||
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
|
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
|
||||||
new_xid =
|
new_xid =
|
||||||
@@ -367,8 +370,16 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
|
|||||||
let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
|
let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||||
|
|
||||||
let first_page_only = seg_off < XLOG_BLCKSZ;
|
let first_page_only = seg_off < XLOG_BLCKSZ;
|
||||||
let (shdr_rem_len, infoflags) = if first_page_only {
|
// If first records starts in the middle of the page, pretend in page header
|
||||||
(seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
|
// there is a fake record which ends where first real record starts. This
|
||||||
|
// makes pg_waldump etc happy.
|
||||||
|
let (shdr_rem_len, infoflags) = if first_page_only && seg_off > 0 {
|
||||||
|
assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD);
|
||||||
|
// xlp_rem_len doesn't include page header, hence the subtraction.
|
||||||
|
(
|
||||||
|
seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD,
|
||||||
|
pg_constants::XLP_FIRST_IS_CONTRECORD,
|
||||||
|
)
|
||||||
} else {
|
} else {
|
||||||
(0, 0)
|
(0, 0)
|
||||||
};
|
};
|
||||||
@@ -397,20 +408,22 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
|
|||||||
|
|
||||||
if !first_page_only {
|
if !first_page_only {
|
||||||
let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
|
let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
|
||||||
|
// see comments above about XLP_FIRST_IS_CONTRECORD and xlp_rem_len.
|
||||||
|
let (xlp_rem_len, xlp_info) = if page_off > 0 {
|
||||||
|
assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
|
||||||
|
(
|
||||||
|
(page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32,
|
||||||
|
pg_constants::XLP_FIRST_IS_CONTRECORD,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
(0, 0)
|
||||||
|
};
|
||||||
let header = XLogPageHeaderData {
|
let header = XLogPageHeaderData {
|
||||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||||
xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
|
xlp_info,
|
||||||
pg_constants::XLP_FIRST_IS_CONTRECORD
|
|
||||||
} else {
|
|
||||||
0
|
|
||||||
},
|
|
||||||
xlp_tli: PG_TLI,
|
xlp_tli: PG_TLI,
|
||||||
xlp_pageaddr: lsn.page_lsn().0,
|
xlp_pageaddr: lsn.page_lsn().0,
|
||||||
xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
|
xlp_rem_len,
|
||||||
page_off as u32
|
|
||||||
} else {
|
|
||||||
0u32
|
|
||||||
},
|
|
||||||
..Default::default() // Put 0 in padding fields.
|
..Default::default() // Put 0 in padding fields.
|
||||||
};
|
};
|
||||||
let hdr_bytes = header.encode()?;
|
let hdr_bytes = header.encode()?;
|
||||||
|
|||||||
@@ -4,7 +4,9 @@ use log::*;
|
|||||||
use postgres::types::PgLsn;
|
use postgres::types::PgLsn;
|
||||||
use postgres::Client;
|
use postgres::Client;
|
||||||
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||||
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
use postgres_ffi::{
|
||||||
|
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||||
|
};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
@@ -262,11 +264,21 @@ fn craft_internal<C: postgres::GenericClient>(
|
|||||||
intermediate_lsns.insert(0, initial_lsn);
|
intermediate_lsns.insert(0, initial_lsn);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Some records may be not flushed, e.g. non-transactional logical messages.
|
// Some records may be not flushed, e.g. non-transactional logical messages. Flush now.
|
||||||
//
|
//
|
||||||
// Note: this is broken if pg_current_wal_insert_lsn is at page boundary
|
// If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
|
||||||
// because pg_current_wal_insert_lsn skips page headers.
|
// returns the position just after the page header on the next page. That's where the next
|
||||||
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
// record will be inserted. But the page header hasn't actually been written to the WAL
|
||||||
|
// yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
|
||||||
|
// error. Because of that, if the insert location is just after a page header, back off to
|
||||||
|
// previous page boundary.
|
||||||
|
let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
|
||||||
|
if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
|
||||||
|
lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
|
||||||
|
} else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
|
||||||
|
lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
|
||||||
|
}
|
||||||
|
client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
|
||||||
Ok(intermediate_lsns)
|
Ok(intermediate_lsns)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -320,61 +332,70 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
|||||||
|
|
||||||
client.execute("CREATE table t(x int)", &[])?;
|
client.execute("CREATE table t(x int)", &[])?;
|
||||||
|
|
||||||
// Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
|
// Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We
|
||||||
// We will use logical message as the padding. We start with detecting how much WAL
|
// will use carefully-sized logical messages to advance WAL insert location such
|
||||||
// it takes for one logical message, considering all alignments and headers.
|
// that there is just enough space on the page for the XLOG_SWITCH record.
|
||||||
let base_wal_advance = {
|
loop {
|
||||||
|
// We start with measuring how much WAL it takes for one logical message,
|
||||||
|
// considering all alignments and headers.
|
||||||
let before_lsn = client.pg_current_wal_insert_lsn()?;
|
let before_lsn = client.pg_current_wal_insert_lsn()?;
|
||||||
// Small non-empty message bigger than few bytes is more likely than an empty
|
|
||||||
// message to have the same format as the big padding message.
|
|
||||||
client.execute(
|
client.execute(
|
||||||
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
|
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
|
||||||
&[],
|
&[],
|
||||||
)?;
|
)?;
|
||||||
// The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
|
let after_lsn = client.pg_current_wal_insert_lsn()?;
|
||||||
(u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
|
|
||||||
+ XLOG_SIZE_OF_XLOG_RECORD
|
|
||||||
};
|
|
||||||
let mut remaining_lsn =
|
|
||||||
XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
|
|
||||||
if remaining_lsn < base_wal_advance {
|
|
||||||
remaining_lsn += XLOG_BLCKSZ;
|
|
||||||
}
|
|
||||||
let repeats = 10 + remaining_lsn - base_wal_advance;
|
|
||||||
info!(
|
|
||||||
"current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
|
|
||||||
client.pg_current_wal_insert_lsn()?,
|
|
||||||
remaining_lsn,
|
|
||||||
base_wal_advance,
|
|
||||||
repeats
|
|
||||||
);
|
|
||||||
client.execute(
|
|
||||||
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
|
|
||||||
&[&(repeats as i32)],
|
|
||||||
)?;
|
|
||||||
info!(
|
|
||||||
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
|
|
||||||
client.pg_current_wal_insert_lsn()?,
|
|
||||||
XLOG_SIZE_OF_XLOG_RECORD
|
|
||||||
);
|
|
||||||
|
|
||||||
// Emit the XLOG_SWITCH
|
// Did the record cross a page boundary? If it did, start over. Crossing a
|
||||||
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
// page boundary adds to the apparent size of the record because of the page
|
||||||
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
// header, which throws off the calculation.
|
||||||
let next_segment = PgLsn::from(0x0200_0000);
|
if u64::from(before_lsn) / XLOG_BLCKSZ as u64
|
||||||
ensure!(
|
!= u64::from(after_lsn) / XLOG_BLCKSZ as u64
|
||||||
xlog_switch_record_end < next_segment,
|
{
|
||||||
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
|
continue;
|
||||||
xlog_switch_record_end,
|
}
|
||||||
next_segment
|
// base_size is the size of a logical message without the payload
|
||||||
);
|
let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
|
||||||
ensure!(
|
|
||||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
// Is there enough space on the page for another logical message and an
|
||||||
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
|
// XLOG_SWITCH? If not, start over.
|
||||||
xlog_switch_record_end,
|
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
|
||||||
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
|
||||||
);
|
continue;
|
||||||
Ok(vec![before_xlog_switch, xlog_switch_record_end])
|
}
|
||||||
|
|
||||||
|
// We will write another logical message, such that after the logical message
|
||||||
|
// record, there will be space for exactly one XLOG_SWITCH. How large should
|
||||||
|
// the logical message's payload be? An XLOG_SWITCH record has no data => its
|
||||||
|
// size is exactly XLOG_SIZE_OF_XLOG_RECORD.
|
||||||
|
let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
|
||||||
|
|
||||||
|
client.execute(
|
||||||
|
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
|
||||||
|
&[&(repeats as i32)],
|
||||||
|
)?;
|
||||||
|
info!(
|
||||||
|
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
|
||||||
|
client.pg_current_wal_insert_lsn()?,
|
||||||
|
XLOG_SIZE_OF_XLOG_RECORD
|
||||||
|
);
|
||||||
|
|
||||||
|
// Emit the XLOG_SWITCH
|
||||||
|
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
|
||||||
|
let xlog_switch_record_end: PgLsn =
|
||||||
|
client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||||
|
|
||||||
|
if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||||
|
!= XLOG_SIZE_OF_XLOG_SHORT_PHD
|
||||||
|
{
|
||||||
|
warn!(
|
||||||
|
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
|
||||||
|
xlog_switch_record_end,
|
||||||
|
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ azure_storage_blobs.workspace = true
|
|||||||
futures-util.workspace = true
|
futures-util.workspace = true
|
||||||
http-types.workspace = true
|
http-types.workspace = true
|
||||||
itertools.workspace = true
|
itertools.workspace = true
|
||||||
|
sync_wrapper = { workspace = true, features = ["futures"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
camino-tempfile.workspace = true
|
camino-tempfile.workspace = true
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::env;
|
use std::env;
|
||||||
|
use std::io;
|
||||||
use std::num::NonZeroU32;
|
use std::num::NonZeroU32;
|
||||||
use std::pin::Pin;
|
use std::pin::Pin;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
@@ -20,6 +21,7 @@ use azure_storage_blobs::blob::CopyStatus;
|
|||||||
use azure_storage_blobs::prelude::ClientBuilder;
|
use azure_storage_blobs::prelude::ClientBuilder;
|
||||||
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
|
use futures::future::Either;
|
||||||
use futures::stream::Stream;
|
use futures::stream::Stream;
|
||||||
use futures_util::StreamExt;
|
use futures_util::StreamExt;
|
||||||
use futures_util::TryStreamExt;
|
use futures_util::TryStreamExt;
|
||||||
@@ -128,12 +130,12 @@ impl AzureBlobStorage {
|
|||||||
let kind = RequestKind::Get;
|
let kind = RequestKind::Get;
|
||||||
|
|
||||||
let _permit = self.permit(kind, cancel).await?;
|
let _permit = self.permit(kind, cancel).await?;
|
||||||
|
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||||
|
let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||||
|
|
||||||
let mut etag = None;
|
let mut etag = None;
|
||||||
let mut last_modified = None;
|
let mut last_modified = None;
|
||||||
let mut metadata = HashMap::new();
|
let mut metadata = HashMap::new();
|
||||||
// TODO give proper streaming response instead of buffering into RAM
|
|
||||||
// https://github.com/neondatabase/neon/issues/5563
|
|
||||||
|
|
||||||
let download = async {
|
let download = async {
|
||||||
let response = builder
|
let response = builder
|
||||||
@@ -152,39 +154,46 @@ impl AzureBlobStorage {
|
|||||||
Err(_elapsed) => Err(DownloadError::Timeout),
|
Err(_elapsed) => Err(DownloadError::Timeout),
|
||||||
});
|
});
|
||||||
|
|
||||||
let mut response = std::pin::pin!(response);
|
let mut response = Box::pin(response);
|
||||||
|
|
||||||
let mut bufs = Vec::new();
|
let Some(part) = response.next().await else {
|
||||||
while let Some(part) = response.next().await {
|
|
||||||
let part = part?;
|
|
||||||
if etag.is_none() {
|
|
||||||
etag = Some(part.blob.properties.etag);
|
|
||||||
}
|
|
||||||
if last_modified.is_none() {
|
|
||||||
last_modified = Some(part.blob.properties.last_modified.into());
|
|
||||||
}
|
|
||||||
if let Some(blob_meta) = part.blob.metadata {
|
|
||||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
|
||||||
}
|
|
||||||
let data = part
|
|
||||||
.data
|
|
||||||
.collect()
|
|
||||||
.await
|
|
||||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
|
||||||
bufs.push(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
if bufs.is_empty() {
|
|
||||||
return Err(DownloadError::Other(anyhow::anyhow!(
|
return Err(DownloadError::Other(anyhow::anyhow!(
|
||||||
"Azure GET response contained no buffers"
|
"Azure GET response contained no response body"
|
||||||
)));
|
)));
|
||||||
|
};
|
||||||
|
let part = part?;
|
||||||
|
if etag.is_none() {
|
||||||
|
etag = Some(part.blob.properties.etag);
|
||||||
}
|
}
|
||||||
|
if last_modified.is_none() {
|
||||||
|
last_modified = Some(part.blob.properties.last_modified.into());
|
||||||
|
}
|
||||||
|
if let Some(blob_meta) = part.blob.metadata {
|
||||||
|
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||||
|
}
|
||||||
|
|
||||||
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already
|
// unwrap safety: both values are set from the first response part above; if there was no first part we have already returned an error
|
||||||
let etag = etag.unwrap();
|
let etag = etag.unwrap();
|
||||||
let last_modified = last_modified.unwrap();
|
let last_modified = last_modified.unwrap();
|
||||||
|
|
||||||
|
let tail_stream = response
|
||||||
|
.map(|part| match part {
|
||||||
|
Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))),
|
||||||
|
Err(e) => {
|
||||||
|
Either::Right(futures::stream::once(async { Err(io::Error::other(e)) }))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.flatten();
|
||||||
|
let stream = part
|
||||||
|
.data
|
||||||
|
.map(|r| r.map_err(io::Error::other))
|
||||||
|
.chain(sync_wrapper::SyncStream::new(tail_stream));
|
||||||
|
//.chain(SyncStream::from_pin(Box::pin(tail_stream)));
|
||||||
|
|
||||||
|
let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream);
|
||||||
|
|
||||||
Ok(Download {
|
Ok(Download {
|
||||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
download_stream: Box::pin(download_stream),
|
||||||
etag,
|
etag,
|
||||||
last_modified,
|
last_modified,
|
||||||
metadata: Some(StorageMetadata(metadata)),
|
metadata: Some(StorageMetadata(metadata)),
|
||||||
@@ -193,7 +202,10 @@ impl AzureBlobStorage {
|
|||||||
|
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
bufs = download => bufs,
|
bufs = download => bufs,
|
||||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
|
||||||
|
TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
|
||||||
|
TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -21,11 +21,13 @@ use std::{
|
|||||||
fmt::Debug,
|
fmt::Debug,
|
||||||
num::{NonZeroU32, NonZeroUsize},
|
num::{NonZeroU32, NonZeroUsize},
|
||||||
pin::Pin,
|
pin::Pin,
|
||||||
|
str::FromStr,
|
||||||
sync::Arc,
|
sync::Arc,
|
||||||
time::{Duration, SystemTime},
|
time::{Duration, SystemTime},
|
||||||
};
|
};
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
|
use aws_sdk_s3::types::StorageClass;
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
|
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
@@ -53,11 +55,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
|||||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||||
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
|
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
|
||||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||||
/// We set this a little bit low as we currently buffer the entire file into RAM
|
/// Set this limit analogously to the S3 limit
|
||||||
///
|
///
|
||||||
/// Here, a limit of max 20k concurrent connections was noted.
|
/// Here, a limit of max 20k concurrent connections was noted.
|
||||||
/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
|
/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
|
||||||
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
|
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
|
||||||
/// No limits on the client side, which currently means 1000 for AWS S3.
|
/// No limits on the client side, which currently means 1000 for AWS S3.
|
||||||
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
|
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
|
||||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||||
@@ -134,6 +136,11 @@ impl RemotePath {
|
|||||||
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
|
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
|
||||||
self.0.strip_prefix(&p.0)
|
self.0.strip_prefix(&p.0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn add_trailing_slash(&self) -> Self {
|
||||||
|
// Unwrap safety: inputs are guaranteed to be valid UTF-8
|
||||||
|
Self(format!("{}/", self.0).try_into().unwrap())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// We don't need callers to be able to pass arbitrary delimiters: just control
|
/// We don't need callers to be able to pass arbitrary delimiters: just control
|
||||||
@@ -157,47 +164,21 @@ pub struct Listing {
|
|||||||
/// providing basic CRUD operations for storage files.
|
/// providing basic CRUD operations for storage files.
|
||||||
#[allow(async_fn_in_trait)]
|
#[allow(async_fn_in_trait)]
|
||||||
pub trait RemoteStorage: Send + Sync + 'static {
|
pub trait RemoteStorage: Send + Sync + 'static {
|
||||||
/// Lists all top level subdirectories for a given prefix
|
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
|
||||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
|
||||||
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
|
///
|
||||||
/// so this method doesn't need to.
|
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
|
||||||
async fn list_prefixes(
|
/// from the absolute root of the bucket.
|
||||||
&self,
|
///
|
||||||
prefix: Option<&RemotePath>,
|
/// `mode` configures whether to use a delimiter. Without a delimiter all keys
|
||||||
cancel: &CancellationToken,
|
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
|
||||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
|
||||||
let result = self
|
/// returned in `keys`.
|
||||||
.list(prefix, ListingMode::WithDelimiter, None, cancel)
|
///
|
||||||
.await?
|
/// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
|
||||||
.prefixes;
|
/// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
|
||||||
Ok(result)
|
/// unlimited size buckets, as the full list of objects is allocated into a monolithic data structure.
|
||||||
}
|
|
||||||
/// Lists all files in directory "recursively"
|
|
||||||
/// (not really recursively, because AWS has a flat namespace)
|
|
||||||
/// Note: This is subtly different from list_prefixes,
|
|
||||||
/// because it is for listing files instead of listing
|
|
||||||
/// names sharing common prefixes.
|
|
||||||
/// For example,
|
|
||||||
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
|
|
||||||
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
|
|
||||||
/// whereas,
|
|
||||||
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
|
||||||
/// See `test_real_s3.rs` for more details.
|
|
||||||
///
|
///
|
||||||
/// max_keys limits max number of keys returned; None means unlimited.
|
|
||||||
async fn list_files(
|
|
||||||
&self,
|
|
||||||
prefix: Option<&RemotePath>,
|
|
||||||
max_keys: Option<NonZeroU32>,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
|
||||||
let result = self
|
|
||||||
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
|
|
||||||
.await?
|
|
||||||
.keys;
|
|
||||||
Ok(result)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn list(
|
async fn list(
|
||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
@@ -336,41 +317,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// A function for listing all the files in a "directory"
|
|
||||||
// Example:
|
|
||||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
|
||||||
//
|
|
||||||
// max_keys limits max number of keys returned; None means unlimited.
|
|
||||||
pub async fn list_files(
|
|
||||||
&self,
|
|
||||||
folder: Option<&RemotePath>,
|
|
||||||
max_keys: Option<NonZeroU32>,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
|
||||||
match self {
|
|
||||||
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
|
|
||||||
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
|
|
||||||
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
|
|
||||||
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// lists common *prefixes*, if any of files
|
|
||||||
// Example:
|
|
||||||
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
|
|
||||||
pub async fn list_prefixes(
|
|
||||||
&self,
|
|
||||||
prefix: Option<&RemotePath>,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
|
||||||
match self {
|
|
||||||
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
|
|
||||||
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
|
|
||||||
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
|
|
||||||
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// See [`RemoteStorage::upload`]
|
/// See [`RemoteStorage::upload`]
|
||||||
pub async fn upload(
|
pub async fn upload(
|
||||||
&self,
|
&self,
|
||||||
@@ -619,6 +565,7 @@ pub struct S3Config {
|
|||||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||||
pub concurrency_limit: NonZeroUsize,
|
pub concurrency_limit: NonZeroUsize,
|
||||||
pub max_keys_per_list_response: Option<i32>,
|
pub max_keys_per_list_response: Option<i32>,
|
||||||
|
pub upload_storage_class: Option<StorageClass>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Debug for S3Config {
|
impl Debug for S3Config {
|
||||||
@@ -747,6 +694,18 @@ impl RemoteStorageConfig {
|
|||||||
endpoint,
|
endpoint,
|
||||||
concurrency_limit,
|
concurrency_limit,
|
||||||
max_keys_per_list_response,
|
max_keys_per_list_response,
|
||||||
|
upload_storage_class: toml
|
||||||
|
.get("upload_storage_class")
|
||||||
|
.map(|prefix_in_bucket| -> anyhow::Result<_> {
|
||||||
|
let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
|
||||||
|
let storage_class = StorageClass::from_str(&s).expect("infallible");
|
||||||
|
#[allow(deprecated)]
|
||||||
|
if matches!(storage_class, StorageClass::Unknown(_)) {
|
||||||
|
bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
|
||||||
|
}
|
||||||
|
Ok(storage_class)
|
||||||
|
})
|
||||||
|
.transpose()?,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
(_, _, _, Some(_), None) => {
|
(_, _, _, Some(_), None) => {
|
||||||
|
|||||||
@@ -5,11 +5,9 @@
|
|||||||
//! volume is mounted to the local FS.
|
//! volume is mounted to the local FS.
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
borrow::Cow,
|
collections::HashSet,
|
||||||
future::Future,
|
|
||||||
io::ErrorKind,
|
io::ErrorKind,
|
||||||
num::NonZeroU32,
|
num::NonZeroU32,
|
||||||
pin::Pin,
|
|
||||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -22,11 +20,11 @@ use tokio::{
|
|||||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||||
};
|
};
|
||||||
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||||
use tracing::*;
|
use utils::crashsafe::path_with_suffix_extension;
|
||||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
||||||
|
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||||
};
|
};
|
||||||
|
|
||||||
use super::{RemoteStorage, StorageMetadata};
|
use super::{RemoteStorage, StorageMetadata};
|
||||||
@@ -93,7 +91,47 @@ impl LocalFs {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
|
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||||
Ok(get_all_files(&self.storage_root, true)
|
use std::{future::Future, pin::Pin};
|
||||||
|
fn get_all_files<'a, P>(
|
||||||
|
directory_path: P,
|
||||||
|
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
|
||||||
|
where
|
||||||
|
P: AsRef<Utf8Path> + Send + Sync + 'a,
|
||||||
|
{
|
||||||
|
Box::pin(async move {
|
||||||
|
let directory_path = directory_path.as_ref();
|
||||||
|
if directory_path.exists() {
|
||||||
|
if directory_path.is_dir() {
|
||||||
|
let mut paths = Vec::new();
|
||||||
|
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||||
|
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||||
|
let file_type = dir_entry.file_type().await?;
|
||||||
|
let entry_path =
|
||||||
|
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
|
||||||
|
anyhow::Error::msg(format!(
|
||||||
|
"non-Unicode path: {}",
|
||||||
|
pb.to_string_lossy()
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
if file_type.is_symlink() {
|
||||||
|
tracing::debug!("{entry_path:?} is a symlink, skipping")
|
||||||
|
} else if file_type.is_dir() {
|
||||||
|
paths.extend(get_all_files(&entry_path).await?.into_iter())
|
||||||
|
} else {
|
||||||
|
paths.push(entry_path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(paths)
|
||||||
|
} else {
|
||||||
|
bail!("Path {directory_path:?} is not a directory")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Ok(Vec::new())
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(get_all_files(&self.storage_root)
|
||||||
.await?
|
.await?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|path| {
|
.map(|path| {
|
||||||
@@ -120,6 +158,14 @@ impl LocalFs {
|
|||||||
// S3 object list prefixes can be arbitrary strings, but when reading
|
// S3 object list prefixes can be arbitrary strings, but when reading
|
||||||
// the local filesystem we need a directory to start calling read_dir on.
|
// the local filesystem we need a directory to start calling read_dir on.
|
||||||
let mut initial_dir = full_path.clone();
|
let mut initial_dir = full_path.clone();
|
||||||
|
|
||||||
|
// If there's no trailing slash, we have to start looking from one above: even if
|
||||||
|
// `initial_dir` is a directory, we should still list any prefixes in the parent
|
||||||
|
// that start with the same string.
|
||||||
|
if !full_path.to_string().ends_with('/') {
|
||||||
|
initial_dir.pop();
|
||||||
|
}
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
// Did we make it to the root?
|
// Did we make it to the root?
|
||||||
if initial_dir.parent().is_none() {
|
if initial_dir.parent().is_none() {
|
||||||
@@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs {
|
|||||||
let op = async {
|
let op = async {
|
||||||
let mut result = Listing::default();
|
let mut result = Listing::default();
|
||||||
|
|
||||||
if let ListingMode::NoDelimiter = mode {
|
// Filter out directories: in S3 directories don't exist, only the keys within them do.
|
||||||
let keys = self
|
let keys = self
|
||||||
.list_recursive(prefix)
|
.list_recursive(prefix)
|
||||||
.await
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
result.keys = keys
|
|
||||||
.into_iter()
|
|
||||||
.filter(|k| {
|
|
||||||
let path = k.with_base(&self.storage_root);
|
|
||||||
!path.is_dir()
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
if let Some(max_keys) = max_keys {
|
|
||||||
result.keys.truncate(max_keys.get() as usize);
|
|
||||||
}
|
|
||||||
|
|
||||||
return Ok(result);
|
|
||||||
}
|
|
||||||
|
|
||||||
let path = match prefix {
|
|
||||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
|
||||||
None => Cow::Borrowed(&self.storage_root),
|
|
||||||
};
|
|
||||||
|
|
||||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
|
||||||
.await
|
.await
|
||||||
.map_err(DownloadError::Other)?;
|
.map_err(DownloadError::Other)?;
|
||||||
|
let keys = keys
|
||||||
|
.into_iter()
|
||||||
|
.filter(|k| {
|
||||||
|
let path = k.with_base(&self.storage_root);
|
||||||
|
!path.is_dir()
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
// filter out empty directories to mirror s3 behavior.
|
if let ListingMode::NoDelimiter = mode {
|
||||||
for prefix in prefixes_to_filter {
|
result.keys = keys;
|
||||||
if prefix.is_dir()
|
} else {
|
||||||
&& is_directory_empty(&prefix)
|
let mut prefixes = HashSet::new();
|
||||||
.await
|
for key in keys {
|
||||||
.map_err(DownloadError::Other)?
|
// If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
|
||||||
{
|
let relative_key = if let Some(prefix) = prefix {
|
||||||
continue;
|
let mut prefix = prefix.clone();
|
||||||
}
|
// We only strip the dirname of the prefix, so that when we strip it from the start of keys we
|
||||||
|
// end up with full file/dir names.
|
||||||
let stripped = prefix
|
let prefix_full_local_path = prefix.with_base(&self.storage_root);
|
||||||
.strip_prefix(&self.storage_root)
|
let has_slash = prefix.0.to_string().ends_with('/');
|
||||||
.context("Failed to strip prefix")
|
let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
|
||||||
.and_then(RemotePath::new)
|
prefix
|
||||||
.expect(
|
} else {
|
||||||
"We list files for storage root, hence should be able to remote the prefix",
|
prefix.0.pop();
|
||||||
);
|
prefix
|
||||||
|
};
|
||||||
if prefix.is_dir() {
|
|
||||||
result.prefixes.push(stripped);
|
RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
|
||||||
} else {
|
} else {
|
||||||
result.keys.push(stripped);
|
key
|
||||||
|
};
|
||||||
|
|
||||||
|
let relative_key = format!("{}", relative_key);
|
||||||
|
if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||||
|
let first_part = relative_key
|
||||||
|
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||||
|
.next()
|
||||||
|
.unwrap()
|
||||||
|
.to_owned();
|
||||||
|
prefixes.insert(first_part);
|
||||||
|
} else {
|
||||||
|
result
|
||||||
|
.keys
|
||||||
|
.push(RemotePath::from_string(&relative_key).unwrap());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
result.prefixes = prefixes
|
||||||
|
.into_iter()
|
||||||
|
.map(|s| RemotePath::from_string(&s).unwrap())
|
||||||
|
.collect();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(max_keys) = max_keys {
|
||||||
|
result.keys.truncate(max_keys.get() as usize);
|
||||||
|
}
|
||||||
Ok(result)
|
Ok(result)
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
|||||||
path_with_suffix_extension(original_path, "metadata")
|
path_with_suffix_extension(original_path, "metadata")
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_all_files<'a, P>(
|
|
||||||
directory_path: P,
|
|
||||||
recursive: bool,
|
|
||||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
|
|
||||||
where
|
|
||||||
P: AsRef<Utf8Path> + Send + Sync + 'a,
|
|
||||||
{
|
|
||||||
Box::pin(async move {
|
|
||||||
let directory_path = directory_path.as_ref();
|
|
||||||
if directory_path.exists() {
|
|
||||||
if directory_path.is_dir() {
|
|
||||||
let mut paths = Vec::new();
|
|
||||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
|
||||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
|
||||||
let file_type = dir_entry.file_type().await?;
|
|
||||||
let entry_path =
|
|
||||||
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
|
|
||||||
anyhow::Error::msg(format!(
|
|
||||||
"non-Unicode path: {}",
|
|
||||||
pb.to_string_lossy()
|
|
||||||
))
|
|
||||||
})?;
|
|
||||||
if file_type.is_symlink() {
|
|
||||||
debug!("{entry_path:?} is a symlink, skipping")
|
|
||||||
} else if file_type.is_dir() {
|
|
||||||
if recursive {
|
|
||||||
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
|
|
||||||
} else {
|
|
||||||
paths.push(entry_path)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
paths.push(entry_path);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(paths)
|
|
||||||
} else {
|
|
||||||
bail!("Path {directory_path:?} is not a directory")
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
Ok(Vec::new())
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
|
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||||
let target_dir = match target_file_path.parent() {
|
let target_dir = match target_file_path.parent() {
|
||||||
Some(parent_dir) => parent_dir,
|
Some(parent_dir) => parent_dir,
|
||||||
@@ -923,13 +930,18 @@ mod fs_tests {
|
|||||||
// No delimiter: should recursively list everything
|
// No delimiter: should recursively list everything
|
||||||
let (storage, cancel) = create_storage()?;
|
let (storage, cancel) = create_storage()?;
|
||||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
|
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
|
||||||
|
let child_sibling =
|
||||||
|
upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
|
||||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
|
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
|
||||||
|
|
||||||
let listing = storage
|
let listing = storage
|
||||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||||
.await?;
|
.await?;
|
||||||
assert!(listing.prefixes.is_empty());
|
assert!(listing.prefixes.is_empty());
|
||||||
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
assert_eq!(
|
||||||
|
listing.keys.into_iter().collect::<HashSet<_>>(),
|
||||||
|
HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
|
||||||
|
);
|
||||||
|
|
||||||
// Delimiter: should only go one deep
|
// Delimiter: should only go one deep
|
||||||
let listing = storage
|
let listing = storage
|
||||||
@@ -942,7 +954,25 @@ mod fs_tests {
|
|||||||
);
|
);
|
||||||
assert!(listing.keys.is_empty());
|
assert!(listing.keys.is_empty());
|
||||||
|
|
||||||
// Delimiter & prefix
|
// Delimiter & prefix with a trailing slash
|
||||||
|
let listing = storage
|
||||||
|
.list(
|
||||||
|
Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
|
||||||
|
ListingMode::WithDelimiter,
|
||||||
|
None,
|
||||||
|
&cancel,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
assert_eq!(
|
||||||
|
listing.keys,
|
||||||
|
[RemotePath::from_string("uncle").unwrap()].to_vec()
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
listing.prefixes,
|
||||||
|
[RemotePath::from_string("parent").unwrap()].to_vec()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Delimiter and prefix without a trailing slash
|
||||||
let listing = storage
|
let listing = storage
|
||||||
.list(
|
.list(
|
||||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
||||||
@@ -951,12 +981,66 @@ mod fs_tests {
|
|||||||
&cancel,
|
&cancel,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
assert_eq!(listing.keys, [].to_vec());
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
listing.prefixes,
|
listing.prefixes,
|
||||||
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
|
[RemotePath::from_string("grandparent").unwrap()].to_vec()
|
||||||
.to_vec()
|
);
|
||||||
|
|
||||||
|
// Delimiter and prefix that's partway through a path component
|
||||||
|
let listing = storage
|
||||||
|
.list(
|
||||||
|
Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
|
||||||
|
ListingMode::WithDelimiter,
|
||||||
|
None,
|
||||||
|
&cancel,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
assert_eq!(listing.keys, [].to_vec());
|
||||||
|
assert_eq!(
|
||||||
|
listing.prefixes,
|
||||||
|
[RemotePath::from_string("grandparent").unwrap()].to_vec()
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn list_part_component() -> anyhow::Result<()> {
|
||||||
|
// No delimiter: should recursively list everything
|
||||||
|
let (storage, cancel) = create_storage()?;
|
||||||
|
|
||||||
|
// Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
|
||||||
|
// of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
|
||||||
|
// a freeform prefix.
|
||||||
|
let _child_a =
|
||||||
|
upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
|
||||||
|
let _child_b =
|
||||||
|
upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
|
||||||
|
|
||||||
|
// Delimiter and prefix that's partway through a path component
|
||||||
|
let listing = storage
|
||||||
|
.list(
|
||||||
|
Some(
|
||||||
|
&RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
|
||||||
|
),
|
||||||
|
ListingMode::WithDelimiter,
|
||||||
|
None,
|
||||||
|
&cancel,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
assert_eq!(listing.keys, [].to_vec());
|
||||||
|
|
||||||
|
let mut found_prefixes = listing.prefixes.clone();
|
||||||
|
found_prefixes.sort();
|
||||||
|
assert_eq!(
|
||||||
|
found_prefixes,
|
||||||
|
[
|
||||||
|
RemotePath::from_string("tenant").unwrap(),
|
||||||
|
RemotePath::from_string("tenant-01").unwrap(),
|
||||||
|
]
|
||||||
|
.to_vec()
|
||||||
);
|
);
|
||||||
assert_eq!(listing.keys, [uncle.clone()].to_vec());
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,10 +27,10 @@ use aws_config::{
|
|||||||
};
|
};
|
||||||
use aws_credential_types::provider::SharedCredentialsProvider;
|
use aws_credential_types::provider::SharedCredentialsProvider;
|
||||||
use aws_sdk_s3::{
|
use aws_sdk_s3::{
|
||||||
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
|
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
|
||||||
error::SdkError,
|
error::SdkError,
|
||||||
operation::get_object::GetObjectError,
|
operation::get_object::GetObjectError,
|
||||||
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
|
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
|
||||||
Client,
|
Client,
|
||||||
};
|
};
|
||||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||||
@@ -62,6 +62,7 @@ pub struct S3Bucket {
|
|||||||
bucket_name: String,
|
bucket_name: String,
|
||||||
prefix_in_bucket: Option<String>,
|
prefix_in_bucket: Option<String>,
|
||||||
max_keys_per_list_response: Option<i32>,
|
max_keys_per_list_response: Option<i32>,
|
||||||
|
upload_storage_class: Option<StorageClass>,
|
||||||
concurrency_limiter: ConcurrencyLimiter,
|
concurrency_limiter: ConcurrencyLimiter,
|
||||||
// Per-request timeout. Accessible for tests.
|
// Per-request timeout. Accessible for tests.
|
||||||
pub timeout: Duration,
|
pub timeout: Duration,
|
||||||
@@ -74,13 +75,13 @@ struct GetObjectRequest {
|
|||||||
}
|
}
|
||||||
impl S3Bucket {
|
impl S3Bucket {
|
||||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||||
pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
|
pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
|
||||||
tracing::debug!(
|
tracing::debug!(
|
||||||
"Creating s3 remote storage for S3 bucket {}",
|
"Creating s3 remote storage for S3 bucket {}",
|
||||||
aws_config.bucket_name
|
remote_storage_config.bucket_name
|
||||||
);
|
);
|
||||||
|
|
||||||
let region = Some(Region::new(aws_config.bucket_region.clone()));
|
let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
|
||||||
|
|
||||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||||
|
|
||||||
@@ -112,6 +113,38 @@ impl S3Bucket {
|
|||||||
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
|
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
|
||||||
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
|
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
|
||||||
|
|
||||||
|
let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults(
|
||||||
|
#[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
|
||||||
|
BehaviorVersion::v2023_11_09(),
|
||||||
|
)
|
||||||
|
.region(region)
|
||||||
|
.identity_cache(IdentityCache::lazy().build())
|
||||||
|
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
|
||||||
|
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
|
||||||
|
|
||||||
|
let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
|
||||||
|
s.spawn(|| {
|
||||||
|
// TODO: make this function async.
|
||||||
|
tokio::runtime::Builder::new_current_thread()
|
||||||
|
.enable_all()
|
||||||
|
.build()
|
||||||
|
.unwrap()
|
||||||
|
.block_on(sdk_config_loader.load())
|
||||||
|
})
|
||||||
|
.join()
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config);
|
||||||
|
|
||||||
|
// Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions.
|
||||||
|
// (In case we ever re-use the `sdk_config` for more than just the S3 client in the future)
|
||||||
|
if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() {
|
||||||
|
s3_config_builder = s3_config_builder
|
||||||
|
.endpoint_url(custom_endpoint)
|
||||||
|
.force_path_style(true);
|
||||||
|
}
|
||||||
|
|
||||||
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
|
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
|
||||||
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
|
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
|
||||||
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
|
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
|
||||||
@@ -119,41 +152,36 @@ impl S3Bucket {
|
|||||||
retry_config
|
retry_config
|
||||||
.set_max_attempts(Some(1))
|
.set_max_attempts(Some(1))
|
||||||
.set_mode(Some(RetryMode::Adaptive));
|
.set_mode(Some(RetryMode::Adaptive));
|
||||||
|
s3_config_builder = s3_config_builder.retry_config(retry_config.build());
|
||||||
|
|
||||||
let mut config_builder = Builder::default()
|
let s3_config = s3_config_builder.build();
|
||||||
.behavior_version(BehaviorVersion::v2023_11_09())
|
let client = aws_sdk_s3::Client::from_conf(s3_config);
|
||||||
.region(region)
|
|
||||||
.identity_cache(IdentityCache::lazy().build())
|
|
||||||
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
|
|
||||||
.retry_config(retry_config.build())
|
|
||||||
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
|
|
||||||
|
|
||||||
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
let prefix_in_bucket = remote_storage_config
|
||||||
config_builder = config_builder
|
.prefix_in_bucket
|
||||||
.endpoint_url(custom_endpoint)
|
.as_deref()
|
||||||
.force_path_style(true);
|
.map(|prefix| {
|
||||||
}
|
let mut prefix = prefix;
|
||||||
|
while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||||
|
prefix = &prefix[1..]
|
||||||
|
}
|
||||||
|
|
||||||
let client = Client::from_conf(config_builder.build());
|
let mut prefix = prefix.to_string();
|
||||||
|
while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||||
|
prefix.pop();
|
||||||
|
}
|
||||||
|
prefix
|
||||||
|
});
|
||||||
|
|
||||||
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
|
||||||
let mut prefix = prefix;
|
|
||||||
while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
|
||||||
prefix = &prefix[1..]
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut prefix = prefix.to_string();
|
|
||||||
while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
|
||||||
prefix.pop();
|
|
||||||
}
|
|
||||||
prefix
|
|
||||||
});
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
client,
|
client,
|
||||||
bucket_name: aws_config.bucket_name.clone(),
|
bucket_name: remote_storage_config.bucket_name.clone(),
|
||||||
max_keys_per_list_response: aws_config.max_keys_per_list_response,
|
max_keys_per_list_response: remote_storage_config.max_keys_per_list_response,
|
||||||
prefix_in_bucket,
|
prefix_in_bucket,
|
||||||
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
|
concurrency_limiter: ConcurrencyLimiter::new(
|
||||||
|
remote_storage_config.concurrency_limit.get(),
|
||||||
|
),
|
||||||
|
upload_storage_class: remote_storage_config.upload_storage_class.clone(),
|
||||||
timeout,
|
timeout,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -178,10 +206,7 @@ impl S3Bucket {
|
|||||||
|
|
||||||
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||||
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||||
let path_string = path
|
let path_string = path.get_path().as_str();
|
||||||
.get_path()
|
|
||||||
.as_str()
|
|
||||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
|
||||||
match &self.prefix_in_bucket {
|
match &self.prefix_in_bucket {
|
||||||
Some(prefix) => prefix.clone() + "/" + path_string,
|
Some(prefix) => prefix.clone() + "/" + path_string,
|
||||||
None => path_string.to_string(),
|
None => path_string.to_string(),
|
||||||
@@ -471,16 +496,11 @@ impl RemoteStorage for S3Bucket {
|
|||||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||||
let list_prefix = prefix
|
let list_prefix = prefix
|
||||||
.map(|p| self.relative_path_to_s3_object(p))
|
.map(|p| self.relative_path_to_s3_object(p))
|
||||||
.or_else(|| self.prefix_in_bucket.clone())
|
.or_else(|| {
|
||||||
.map(|mut p| {
|
self.prefix_in_bucket.clone().map(|mut s| {
|
||||||
// required to end with a separator
|
s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||||
// otherwise request will return only the entry of a prefix
|
s
|
||||||
if matches!(mode, ListingMode::WithDelimiter)
|
})
|
||||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
|
||||||
{
|
|
||||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
|
||||||
}
|
|
||||||
p
|
|
||||||
});
|
});
|
||||||
|
|
||||||
let _permit = self.permit(kind, cancel).await?;
|
let _permit = self.permit(kind, cancel).await?;
|
||||||
@@ -549,11 +569,15 @@ impl RemoteStorage for S3Bucket {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
result.prefixes.extend(
|
// S3 gives us prefixes like "foo/", we return them like "foo"
|
||||||
prefixes
|
result.prefixes.extend(prefixes.iter().filter_map(|o| {
|
||||||
.iter()
|
Some(
|
||||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
self.s3_object_to_relative_path(
|
||||||
);
|
o.prefix()?
|
||||||
|
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
}));
|
||||||
|
|
||||||
continuation_token = match response.next_continuation_token {
|
continuation_token = match response.next_continuation_token {
|
||||||
Some(new_token) => Some(new_token),
|
Some(new_token) => Some(new_token),
|
||||||
@@ -586,6 +610,7 @@ impl RemoteStorage for S3Bucket {
|
|||||||
.bucket(self.bucket_name.clone())
|
.bucket(self.bucket_name.clone())
|
||||||
.key(self.relative_path_to_s3_object(to))
|
.key(self.relative_path_to_s3_object(to))
|
||||||
.set_metadata(metadata.map(|m| m.0))
|
.set_metadata(metadata.map(|m| m.0))
|
||||||
|
.set_storage_class(self.upload_storage_class.clone())
|
||||||
.content_length(from_size_bytes.try_into()?)
|
.content_length(from_size_bytes.try_into()?)
|
||||||
.body(bytes_stream)
|
.body(bytes_stream)
|
||||||
.send();
|
.send();
|
||||||
@@ -637,6 +662,7 @@ impl RemoteStorage for S3Bucket {
|
|||||||
.copy_object()
|
.copy_object()
|
||||||
.bucket(self.bucket_name.clone())
|
.bucket(self.bucket_name.clone())
|
||||||
.key(self.relative_path_to_s3_object(to))
|
.key(self.relative_path_to_s3_object(to))
|
||||||
|
.set_storage_class(self.upload_storage_class.clone())
|
||||||
.copy_source(copy_source)
|
.copy_source(copy_source)
|
||||||
.send();
|
.send();
|
||||||
|
|
||||||
@@ -894,6 +920,7 @@ impl RemoteStorage for S3Bucket {
|
|||||||
.copy_object()
|
.copy_object()
|
||||||
.bucket(self.bucket_name.clone())
|
.bucket(self.bucket_name.clone())
|
||||||
.key(key)
|
.key(key)
|
||||||
|
.set_storage_class(self.upload_storage_class.clone())
|
||||||
.copy_source(&source_id)
|
.copy_source(&source_id)
|
||||||
.send();
|
.send();
|
||||||
|
|
||||||
@@ -1050,22 +1077,22 @@ mod tests {
|
|||||||
Some("/test/prefix/"),
|
Some("/test/prefix/"),
|
||||||
];
|
];
|
||||||
let expected_outputs = [
|
let expected_outputs = [
|
||||||
vec!["", "some/path", "some/path"],
|
vec!["", "some/path", "some/path/"],
|
||||||
vec!["/", "/some/path", "/some/path"],
|
vec!["/", "/some/path", "/some/path/"],
|
||||||
vec![
|
vec![
|
||||||
"test/prefix/",
|
"test/prefix/",
|
||||||
"test/prefix/some/path",
|
"test/prefix/some/path",
|
||||||
"test/prefix/some/path",
|
"test/prefix/some/path/",
|
||||||
],
|
],
|
||||||
vec![
|
vec![
|
||||||
"test/prefix/",
|
"test/prefix/",
|
||||||
"test/prefix/some/path",
|
"test/prefix/some/path",
|
||||||
"test/prefix/some/path",
|
"test/prefix/some/path/",
|
||||||
],
|
],
|
||||||
vec![
|
vec![
|
||||||
"test/prefix/",
|
"test/prefix/",
|
||||||
"test/prefix/some/path",
|
"test/prefix/some/path",
|
||||||
"test/prefix/some/path",
|
"test/prefix/some/path/",
|
||||||
],
|
],
|
||||||
];
|
];
|
||||||
|
|
||||||
@@ -1077,6 +1104,7 @@ mod tests {
|
|||||||
endpoint: None,
|
endpoint: None,
|
||||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||||
max_keys_per_list_response: Some(5),
|
max_keys_per_list_response: Some(5),
|
||||||
|
upload_storage_class: None,
|
||||||
};
|
};
|
||||||
let storage =
|
let storage =
|
||||||
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
|
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
|
||||||
|
|||||||
@@ -107,27 +107,6 @@ impl UnreliableWrapper {
|
|||||||
type VoidStorage = crate::LocalFs;
|
type VoidStorage = crate::LocalFs;
|
||||||
|
|
||||||
impl RemoteStorage for UnreliableWrapper {
|
impl RemoteStorage for UnreliableWrapper {
|
||||||
async fn list_prefixes(
|
|
||||||
&self,
|
|
||||||
prefix: Option<&RemotePath>,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
|
||||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
self.inner.list_prefixes(prefix, cancel).await
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn list_files(
|
|
||||||
&self,
|
|
||||||
folder: Option<&RemotePath>,
|
|
||||||
max_keys: Option<NonZeroU32>,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
|
||||||
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
self.inner.list_files(folder, max_keys, cancel).await
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn list(
|
async fn list(
|
||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
|
use remote_storage::ListingMode;
|
||||||
use remote_storage::RemotePath;
|
use remote_storage::RemotePath;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::{collections::HashSet, num::NonZeroU32};
|
use std::{collections::HashSet, num::NonZeroU32};
|
||||||
@@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
|||||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||||
.context("common_prefix construction")?;
|
.context("common_prefix construction")?;
|
||||||
let root_remote_prefixes = test_client
|
let root_remote_prefixes = test_client
|
||||||
.list_prefixes(None, &cancel)
|
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
||||||
.await
|
.await?
|
||||||
.context("client list root prefixes failure")?
|
.prefixes
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<HashSet<_>>();
|
.collect::<HashSet<_>>();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
|||||||
);
|
);
|
||||||
|
|
||||||
let nested_remote_prefixes = test_client
|
let nested_remote_prefixes = test_client
|
||||||
.list_prefixes(Some(&base_prefix), &cancel)
|
.list(
|
||||||
.await
|
Some(&base_prefix.add_trailing_slash()),
|
||||||
.context("client list nested prefixes failure")?
|
ListingMode::WithDelimiter,
|
||||||
|
None,
|
||||||
|
&cancel,
|
||||||
|
)
|
||||||
|
.await?
|
||||||
|
.prefixes
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<HashSet<_>>();
|
.collect::<HashSet<_>>();
|
||||||
let remote_only_prefixes = nested_remote_prefixes
|
let remote_only_prefixes = nested_remote_prefixes
|
||||||
@@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
|||||||
///
|
///
|
||||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
||||||
/// Then performs the following queries:
|
/// Then performs the following queries:
|
||||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||||
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
|
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
|
async fn list_no_delimiter_works(
|
||||||
|
ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
let ctx = match ctx {
|
let ctx = match ctx {
|
||||||
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
|
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||||
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
|
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
|
||||||
@@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
|
|||||||
let base_prefix =
|
let base_prefix =
|
||||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||||
let root_files = test_client
|
let root_files = test_client
|
||||||
.list_files(None, None, &cancel)
|
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||||
.await
|
.await
|
||||||
.context("client list root files failure")?
|
.context("client list root files failure")?
|
||||||
|
.keys
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<HashSet<_>>();
|
.collect::<HashSet<_>>();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
root_files,
|
root_files,
|
||||||
ctx.remote_blobs.clone(),
|
ctx.remote_blobs.clone(),
|
||||||
"remote storage list_files on root mismatches with the uploads."
|
"remote storage list on root mismatches with the uploads."
|
||||||
);
|
);
|
||||||
|
|
||||||
// Test that max_keys limit works. In total there are about 21 files (see
|
// Test that max_keys limit works. In total there are about 21 files (see
|
||||||
// upload_simple_remote_data call in test_real_s3.rs).
|
// upload_simple_remote_data call in test_real_s3.rs).
|
||||||
let limited_root_files = test_client
|
let limited_root_files = test_client
|
||||||
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
|
.list(
|
||||||
|
None,
|
||||||
|
ListingMode::NoDelimiter,
|
||||||
|
Some(NonZeroU32::new(2).unwrap()),
|
||||||
|
&cancel,
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
.context("client list root files failure")?;
|
.context("client list root files failure")?;
|
||||||
assert_eq!(limited_root_files.len(), 2);
|
assert_eq!(limited_root_files.keys.len(), 2);
|
||||||
|
|
||||||
let nested_remote_files = test_client
|
let nested_remote_files = test_client
|
||||||
.list_files(Some(&base_prefix), None, &cancel)
|
.list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
|
||||||
.await
|
.await
|
||||||
.context("client list nested files failure")?
|
.context("client list nested files failure")?
|
||||||
|
.keys
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect::<HashSet<_>>();
|
.collect::<HashSet<_>>();
|
||||||
let trim_remote_blobs: HashSet<_> = ctx
|
let trim_remote_blobs: HashSet<_> = ctx
|
||||||
@@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
|
|||||||
.collect();
|
.collect();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
nested_remote_files, trim_remote_blobs,
|
nested_remote_files, trim_remote_blobs,
|
||||||
"remote storage list_files on subdirrectory mismatches with the uploads."
|
"remote storage list on subdirrectory mismatches with the uploads."
|
||||||
);
|
);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
|
|||||||
|
|
||||||
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
|
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
|
||||||
|
|
||||||
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
|
let prefixes = ctx
|
||||||
|
.client
|
||||||
|
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
||||||
|
.await?
|
||||||
|
.prefixes;
|
||||||
|
|
||||||
assert_eq!(prefixes.len(), 1);
|
assert_eq!(prefixes.len(), 1);
|
||||||
|
|
||||||
|
|||||||
@@ -132,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
|
||||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
|
||||||
// whereas the list_files function is concerned with listing files.
|
|
||||||
// See `RemoteStorage::list_files` documentation for more details
|
|
||||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||||
Enabled(AzureWithSimpleTestBlobs),
|
Enabled(AzureWithSimpleTestBlobs),
|
||||||
Disabled,
|
Disabled,
|
||||||
|
|||||||
@@ -12,8 +12,8 @@ use anyhow::Context;
|
|||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use futures_util::StreamExt;
|
use futures_util::StreamExt;
|
||||||
use remote_storage::{
|
use remote_storage::{
|
||||||
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
|
||||||
S3Config,
|
RemoteStorageKind, S3Config,
|
||||||
};
|
};
|
||||||
use test_context::test_context;
|
use test_context::test_context;
|
||||||
use test_context::AsyncTestContext;
|
use test_context::AsyncTestContext;
|
||||||
@@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
|||||||
client: &Arc<GenericRemoteStorage>,
|
client: &Arc<GenericRemoteStorage>,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> anyhow::Result<HashSet<RemotePath>> {
|
) -> anyhow::Result<HashSet<RemotePath>> {
|
||||||
Ok(retry(|| client.list_files(None, None, cancel))
|
Ok(
|
||||||
.await
|
retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
|
||||||
.context("list root files failure")?
|
.await
|
||||||
.into_iter()
|
.context("list root files failure")?
|
||||||
.collect::<HashSet<_>>())
|
.keys
|
||||||
|
.into_iter()
|
||||||
|
.collect::<HashSet<_>>(),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
let cancel = CancellationToken::new();
|
let cancel = CancellationToken::new();
|
||||||
@@ -294,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
|
||||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
|
||||||
// whereas the list_files function is concerned with listing files.
|
|
||||||
// See `RemoteStorage::list_files` documentation for more details
|
|
||||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||||
Enabled(S3WithSimpleTestBlobs),
|
Enabled(S3WithSimpleTestBlobs),
|
||||||
Disabled,
|
Disabled,
|
||||||
@@ -381,6 +380,7 @@ fn create_s3_client(
|
|||||||
endpoint: None,
|
endpoint: None,
|
||||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||||
max_keys_per_list_response,
|
max_keys_per_list_response,
|
||||||
|
upload_storage_class: None,
|
||||||
}),
|
}),
|
||||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -50,6 +50,9 @@ pub struct SkTimelineInfo {
|
|||||||
pub safekeeper_connstr: Option<String>,
|
pub safekeeper_connstr: Option<String>,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub http_connstr: Option<String>,
|
pub http_connstr: Option<String>,
|
||||||
|
// Minimum of all active RO replicas flush LSN
|
||||||
|
#[serde(default = "lsn_invalid")]
|
||||||
|
pub standby_horizon: Lsn,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||||
|
|||||||
@@ -9,6 +9,33 @@ use serde::{Deserialize, Serialize};
|
|||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
|
/// Declare a failpoint that can use the `pause` failpoint action.
|
||||||
|
/// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||||
|
#[macro_export]
|
||||||
|
macro_rules! pausable_failpoint {
|
||||||
|
($name:literal) => {
|
||||||
|
if cfg!(feature = "testing") {
|
||||||
|
tokio::task::spawn_blocking({
|
||||||
|
let current = tracing::Span::current();
|
||||||
|
move || {
|
||||||
|
let _entered = current.entered();
|
||||||
|
tracing::info!("at failpoint {}", $name);
|
||||||
|
fail::fail_point!($name);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.expect("spawn_blocking");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
($name:literal, $cond:expr) => {
|
||||||
|
if cfg!(feature = "testing") {
|
||||||
|
if $cond {
|
||||||
|
pausable_failpoint!($name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
/// use with fail::cfg("$name", "return(2000)")
|
/// use with fail::cfg("$name", "return(2000)")
|
||||||
///
|
///
|
||||||
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
|
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
|
||||||
|
|||||||
@@ -34,6 +34,8 @@ pub enum Generation {
|
|||||||
/// scenarios where pageservers might otherwise issue conflicting writes to
|
/// scenarios where pageservers might otherwise issue conflicting writes to
|
||||||
/// remote storage
|
/// remote storage
|
||||||
impl Generation {
|
impl Generation {
|
||||||
|
pub const MAX: Self = Self::Valid(u32::MAX);
|
||||||
|
|
||||||
/// Create a new Generation that represents a legacy key format with
|
/// Create a new Generation that represents a legacy key format with
|
||||||
/// no generation suffix
|
/// no generation suffix
|
||||||
pub fn none() -> Self {
|
pub fn none() -> Self {
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
//! # Example
|
//! # Example
|
||||||
//!
|
//!
|
||||||
//! ```
|
//! ```
|
||||||
//! # tokio_test::block_on(async {
|
//! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async {
|
||||||
//! use utils::poison::Poison;
|
//! use utils::poison::Poison;
|
||||||
//! use std::time::Duration;
|
//! use std::time::Duration;
|
||||||
//!
|
//!
|
||||||
|
|||||||
@@ -2,11 +2,10 @@
|
|||||||
|
|
||||||
use std::cmp::{Eq, Ordering};
|
use std::cmp::{Eq, Ordering};
|
||||||
use std::collections::BinaryHeap;
|
use std::collections::BinaryHeap;
|
||||||
use std::fmt::Debug;
|
|
||||||
use std::mem;
|
use std::mem;
|
||||||
use std::sync::Mutex;
|
use std::sync::Mutex;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tokio::sync::watch::{channel, Receiver, Sender};
|
use tokio::sync::watch::{self, channel};
|
||||||
use tokio::time::timeout;
|
use tokio::time::timeout;
|
||||||
|
|
||||||
/// An error happened while waiting for a number
|
/// An error happened while waiting for a number
|
||||||
@@ -35,23 +34,73 @@ pub trait MonotonicCounter<V> {
|
|||||||
fn cnt_value(&self) -> V;
|
fn cnt_value(&self) -> V;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Internal components of a `SeqWait`
|
/// Heap of waiters, lowest numbers pop first.
|
||||||
struct SeqWaitInt<S, V>
|
struct Waiters<V>
|
||||||
where
|
where
|
||||||
S: MonotonicCounter<V>,
|
|
||||||
V: Ord,
|
V: Ord,
|
||||||
{
|
{
|
||||||
waiters: BinaryHeap<Waiter<V>>,
|
heap: BinaryHeap<Waiter<V>>,
|
||||||
current: S,
|
/// Number of the first waiter in the heap, or None if there are no waiters.
|
||||||
shutdown: bool,
|
status_channel: watch::Sender<Option<V>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<V> Waiters<V>
|
||||||
|
where
|
||||||
|
V: Ord + Copy,
|
||||||
|
{
|
||||||
|
fn new() -> Self {
|
||||||
|
Waiters {
|
||||||
|
heap: BinaryHeap::new(),
|
||||||
|
status_channel: channel(None).0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `status_channel` contains the number of the first waiter in the heap.
|
||||||
|
/// This function should be called whenever waiters heap changes.
|
||||||
|
fn update_status(&self) {
|
||||||
|
let first_waiter = self.heap.peek().map(|w| w.wake_num);
|
||||||
|
let _ = self.status_channel.send_replace(first_waiter);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add new waiter to the heap, return a channel that will be notified when the number arrives.
|
||||||
|
fn add(&mut self, num: V) -> watch::Receiver<()> {
|
||||||
|
let (tx, rx) = channel(());
|
||||||
|
self.heap.push(Waiter {
|
||||||
|
wake_num: num,
|
||||||
|
wake_channel: tx,
|
||||||
|
});
|
||||||
|
self.update_status();
|
||||||
|
rx
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pop all waiters <= num from the heap. Collect channels in a vector,
|
||||||
|
/// so that caller can wake them up.
|
||||||
|
fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
|
||||||
|
let mut wake_these = Vec::new();
|
||||||
|
while let Some(n) = self.heap.peek() {
|
||||||
|
if n.wake_num > num {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
wake_these.push(self.heap.pop().unwrap().wake_channel);
|
||||||
|
}
|
||||||
|
self.update_status();
|
||||||
|
wake_these
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Used on shutdown to efficiently drop all waiters.
|
||||||
|
fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
|
||||||
|
let heap = mem::take(&mut self.heap);
|
||||||
|
self.update_status();
|
||||||
|
heap
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Waiter<T>
|
struct Waiter<T>
|
||||||
where
|
where
|
||||||
T: Ord,
|
T: Ord,
|
||||||
{
|
{
|
||||||
wake_num: T, // wake me when this number arrives ...
|
wake_num: T, // wake me when this number arrives ...
|
||||||
wake_channel: Sender<()>, // ... by sending a message to this channel
|
wake_channel: watch::Sender<()>, // ... by sending a message to this channel
|
||||||
}
|
}
|
||||||
|
|
||||||
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
||||||
@@ -76,6 +125,17 @@ impl<T: Ord> PartialEq for Waiter<T> {
|
|||||||
|
|
||||||
impl<T: Ord> Eq for Waiter<T> {}
|
impl<T: Ord> Eq for Waiter<T> {}
|
||||||
|
|
||||||
|
/// Internal components of a `SeqWait`
|
||||||
|
struct SeqWaitInt<S, V>
|
||||||
|
where
|
||||||
|
S: MonotonicCounter<V>,
|
||||||
|
V: Ord,
|
||||||
|
{
|
||||||
|
waiters: Waiters<V>,
|
||||||
|
current: S,
|
||||||
|
shutdown: bool,
|
||||||
|
}
|
||||||
|
|
||||||
/// A tool for waiting on a sequence number
|
/// A tool for waiting on a sequence number
|
||||||
///
|
///
|
||||||
/// This provides a way to wait the arrival of a number.
|
/// This provides a way to wait the arrival of a number.
|
||||||
@@ -108,7 +168,7 @@ where
|
|||||||
/// Create a new `SeqWait`, initialized to a particular number
|
/// Create a new `SeqWait`, initialized to a particular number
|
||||||
pub fn new(starting_num: S) -> Self {
|
pub fn new(starting_num: S) -> Self {
|
||||||
let internal = SeqWaitInt {
|
let internal = SeqWaitInt {
|
||||||
waiters: BinaryHeap::new(),
|
waiters: Waiters::new(),
|
||||||
current: starting_num,
|
current: starting_num,
|
||||||
shutdown: false,
|
shutdown: false,
|
||||||
};
|
};
|
||||||
@@ -128,9 +188,8 @@ where
|
|||||||
// Block any future waiters from starting
|
// Block any future waiters from starting
|
||||||
internal.shutdown = true;
|
internal.shutdown = true;
|
||||||
|
|
||||||
// This will steal the entire waiters map.
|
// Take all waiters to drop them later.
|
||||||
// When we drop it all waiters will be woken.
|
internal.waiters.take_all()
|
||||||
mem::take(&mut internal.waiters)
|
|
||||||
|
|
||||||
// Drop the lock as we exit this scope.
|
// Drop the lock as we exit this scope.
|
||||||
};
|
};
|
||||||
@@ -196,7 +255,7 @@ where
|
|||||||
|
|
||||||
/// Register and return a channel that will be notified when a number arrives,
|
/// Register and return a channel that will be notified when a number arrives,
|
||||||
/// or None, if it has already arrived.
|
/// or None, if it has already arrived.
|
||||||
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
|
fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
|
||||||
let mut internal = self.internal.lock().unwrap();
|
let mut internal = self.internal.lock().unwrap();
|
||||||
if internal.current.cnt_value() >= num {
|
if internal.current.cnt_value() >= num {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
@@ -205,12 +264,8 @@ where
|
|||||||
return Err(SeqWaitError::Shutdown);
|
return Err(SeqWaitError::Shutdown);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create a new channel.
|
// Add waiter channel to the queue.
|
||||||
let (tx, rx) = channel(());
|
let rx = internal.waiters.add(num);
|
||||||
internal.waiters.push(Waiter {
|
|
||||||
wake_num: num,
|
|
||||||
wake_channel: tx,
|
|
||||||
});
|
|
||||||
// Drop the lock as we exit this scope.
|
// Drop the lock as we exit this scope.
|
||||||
Ok(Some(rx))
|
Ok(Some(rx))
|
||||||
}
|
}
|
||||||
@@ -231,16 +286,8 @@ where
|
|||||||
}
|
}
|
||||||
internal.current.cnt_advance(num);
|
internal.current.cnt_advance(num);
|
||||||
|
|
||||||
// Pop all waiters <= num from the heap. Collect them in a vector, and
|
// Pop all waiters <= num from the heap.
|
||||||
// wake them up after releasing the lock.
|
internal.waiters.pop_leq(num)
|
||||||
let mut wake_these = Vec::new();
|
|
||||||
while let Some(n) = internal.waiters.peek() {
|
|
||||||
if n.wake_num > num {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
wake_these.push(internal.waiters.pop().unwrap().wake_channel);
|
|
||||||
}
|
|
||||||
wake_these
|
|
||||||
};
|
};
|
||||||
|
|
||||||
for tx in wake_these {
|
for tx in wake_these {
|
||||||
@@ -255,6 +302,23 @@ where
|
|||||||
pub fn load(&self) -> S {
|
pub fn load(&self) -> S {
|
||||||
self.internal.lock().unwrap().current
|
self.internal.lock().unwrap().current
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get a Receiver for the current status.
|
||||||
|
///
|
||||||
|
/// The current status is the number of the first waiter in the queue,
|
||||||
|
/// or None if there are no waiters.
|
||||||
|
///
|
||||||
|
/// This receiver will be notified whenever the status changes.
|
||||||
|
/// It is useful for receiving notifications when the first waiter
|
||||||
|
/// starts waiting for a number, or when there are no more waiters left.
|
||||||
|
pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
|
||||||
|
self.internal
|
||||||
|
.lock()
|
||||||
|
.unwrap()
|
||||||
|
.waiters
|
||||||
|
.status_channel
|
||||||
|
.subscribe()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
@@ -135,7 +135,8 @@ impl Gate {
|
|||||||
let started_at = std::time::Instant::now();
|
let started_at = std::time::Instant::now();
|
||||||
let mut do_close = std::pin::pin!(self.do_close());
|
let mut do_close = std::pin::pin!(self.do_close());
|
||||||
|
|
||||||
let nag_after = Duration::from_secs(1);
|
// with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
|
||||||
|
let nag_after = Duration::from_millis(100);
|
||||||
|
|
||||||
let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
|
let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
|
||||||
return;
|
return;
|
||||||
|
|||||||
@@ -50,6 +50,14 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) {
|
||||||
|
unsafe {
|
||||||
|
let callback_data = (*(*wp).config).callback_data;
|
||||||
|
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||||
|
(*api).update_donor(&mut (*donor), donor_lsn)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
|
extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
|
||||||
unsafe {
|
unsafe {
|
||||||
let callback_data = (*(*wp).config).callback_data;
|
let callback_data = (*(*wp).config).callback_data;
|
||||||
@@ -391,6 +399,7 @@ pub(crate) fn create_api() -> walproposer_api {
|
|||||||
get_shmem_state: Some(get_shmem_state),
|
get_shmem_state: Some(get_shmem_state),
|
||||||
start_streaming: Some(start_streaming),
|
start_streaming: Some(start_streaming),
|
||||||
get_flush_rec_ptr: Some(get_flush_rec_ptr),
|
get_flush_rec_ptr: Some(get_flush_rec_ptr),
|
||||||
|
update_donor: Some(update_donor),
|
||||||
get_current_timestamp: Some(get_current_timestamp),
|
get_current_timestamp: Some(get_current_timestamp),
|
||||||
conn_error_message: Some(conn_error_message),
|
conn_error_message: Some(conn_error_message),
|
||||||
conn_status: Some(conn_status),
|
conn_status: Some(conn_status),
|
||||||
@@ -421,6 +430,32 @@ pub(crate) fn create_api() -> walproposer_api {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
||||||
|
let empty_feedback = crate::bindings::PageserverFeedback {
|
||||||
|
present: false,
|
||||||
|
currentClusterSize: 0,
|
||||||
|
last_received_lsn: 0,
|
||||||
|
disk_consistent_lsn: 0,
|
||||||
|
remote_consistent_lsn: 0,
|
||||||
|
replytime: 0,
|
||||||
|
shard_number: 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
crate::bindings::WalproposerShmemState {
|
||||||
|
propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||||
|
donor_name: [0; 64],
|
||||||
|
donor_conninfo: [0; 1024],
|
||||||
|
donor_lsn: 0,
|
||||||
|
mutex: 0,
|
||||||
|
mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||||
|
backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||||
|
currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||||
|
shard_ps_feedback: [empty_feedback; 128],
|
||||||
|
num_shards: 0,
|
||||||
|
min_ps_feedback: empty_feedback,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for Level {
|
impl std::fmt::Display for Level {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||||
write!(f, "{:?}", self)
|
write!(f, "{:?}", self)
|
||||||
|
|||||||
@@ -1,8 +1,5 @@
|
|||||||
use std::ffi::CString;
|
use std::ffi::CString;
|
||||||
|
|
||||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
|
||||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
api_bindings::{create_api, take_vec_u8, Level},
|
api_bindings::{create_api, take_vec_u8, Level},
|
||||||
bindings::{
|
bindings::{
|
||||||
@@ -10,6 +7,8 @@ use crate::{
|
|||||||
WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart,
|
WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||||
|
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||||
|
|
||||||
/// Rust high-level wrapper for C walproposer API. Many methods are not required
|
/// Rust high-level wrapper for C walproposer API. Many methods are not required
|
||||||
/// for simple cases, hence todo!() in default implementations.
|
/// for simple cases, hence todo!() in default implementations.
|
||||||
@@ -28,6 +27,10 @@ pub trait ApiImpl {
|
|||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
fn get_current_timestamp(&self) -> i64 {
|
fn get_current_timestamp(&self) -> i64 {
|
||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
@@ -274,6 +277,7 @@ mod tests {
|
|||||||
sync::{atomic::AtomicUsize, mpsc::sync_channel},
|
sync::{atomic::AtomicUsize, mpsc::sync_channel},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
use std::cell::UnsafeCell;
|
||||||
use utils::id::TenantTimelineId;
|
use utils::id::TenantTimelineId;
|
||||||
|
|
||||||
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
||||||
@@ -297,6 +301,8 @@ mod tests {
|
|||||||
replies_ptr: AtomicUsize,
|
replies_ptr: AtomicUsize,
|
||||||
// channel to send LSN to the main thread
|
// channel to send LSN to the main thread
|
||||||
sync_channel: std::sync::mpsc::SyncSender<u64>,
|
sync_channel: std::sync::mpsc::SyncSender<u64>,
|
||||||
|
// Shmem state, used for storing donor info
|
||||||
|
shmem: UnsafeCell<crate::bindings::WalproposerShmemState>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MockImpl {
|
impl MockImpl {
|
||||||
@@ -327,11 +333,22 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl ApiImpl for MockImpl {
|
impl ApiImpl for MockImpl {
|
||||||
|
fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState {
|
||||||
|
self.shmem.get()
|
||||||
|
}
|
||||||
|
|
||||||
fn get_current_timestamp(&self) -> i64 {
|
fn get_current_timestamp(&self) -> i64 {
|
||||||
println!("get_current_timestamp");
|
println!("get_current_timestamp");
|
||||||
0
|
0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) {
|
||||||
|
let mut shmem = unsafe { *self.get_shmem_state() };
|
||||||
|
shmem.propEpochStartLsn.value = donor_lsn;
|
||||||
|
shmem.donor_conninfo = donor.conninfo;
|
||||||
|
shmem.donor_lsn = donor_lsn;
|
||||||
|
}
|
||||||
|
|
||||||
fn conn_status(
|
fn conn_status(
|
||||||
&self,
|
&self,
|
||||||
_: &mut crate::bindings::Safekeeper,
|
_: &mut crate::bindings::Safekeeper,
|
||||||
@@ -479,9 +496,9 @@ mod tests {
|
|||||||
// TODO: When updating Postgres versions, this test will cause
|
// TODO: When updating Postgres versions, this test will cause
|
||||||
// problems. Postgres version in message needs updating.
|
// problems. Postgres version in message needs updating.
|
||||||
//
|
//
|
||||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||||
vec![
|
vec![
|
||||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
||||||
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
||||||
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
||||||
@@ -507,6 +524,7 @@ mod tests {
|
|||||||
],
|
],
|
||||||
replies_ptr: AtomicUsize::new(0),
|
replies_ptr: AtomicUsize::new(0),
|
||||||
sync_channel: sender,
|
sync_channel: sender,
|
||||||
|
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
|
||||||
});
|
});
|
||||||
let config = crate::walproposer::Config {
|
let config = crate::walproposer::Config {
|
||||||
ttid,
|
ttid,
|
||||||
|
|||||||
@@ -70,6 +70,7 @@ tokio-stream.workspace = true
|
|||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
|
twox-hash.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
walkdir.workspace = true
|
walkdir.workspace = true
|
||||||
metrics.workspace = true
|
metrics.workspace = true
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||||
use pageserver::repository::Key;
|
use pageserver::repository::Key;
|
||||||
use pageserver::tenant::layer_map::LayerMap;
|
use pageserver::tenant::layer_map::LayerMap;
|
||||||
use pageserver::tenant::storage_layer::LayerFileName;
|
use pageserver::tenant::storage_layer::LayerName;
|
||||||
use pageserver::tenant::storage_layer::PersistentLayerDesc;
|
use pageserver::tenant::storage_layer::PersistentLayerDesc;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||||
@@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
|
|||||||
let mut updates = layer_map.batch_update();
|
let mut updates = layer_map.batch_update();
|
||||||
for fname in filenames {
|
for fname in filenames {
|
||||||
let fname = fname.unwrap();
|
let fname = fname.unwrap();
|
||||||
let fname = LayerFileName::from_str(&fname).unwrap();
|
let fname = LayerName::from_str(&fname).unwrap();
|
||||||
let layer = PersistentLayerDesc::from(fname);
|
let layer = PersistentLayerDesc::from(fname);
|
||||||
|
|
||||||
let lsn_range = layer.get_lsn_range();
|
let lsn_range = layer.get_lsn_range();
|
||||||
|
|||||||
@@ -30,47 +30,27 @@
|
|||||||
//! 2024-04-15 on i3en.3xlarge
|
//! 2024-04-15 on i3en.3xlarge
|
||||||
//!
|
//!
|
||||||
//! ```text
|
//! ```text
|
||||||
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
//! short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
||||||
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
//! short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
||||||
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
//! short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
||||||
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
//! short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
||||||
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
//! short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
||||||
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
//! short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
||||||
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
//! short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
||||||
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
||||||
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
||||||
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
||||||
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
||||||
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
||||||
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
||||||
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
||||||
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
||||||
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||||
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
|
|
||||||
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
|
|
||||||
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
|
|
||||||
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
|
|
||||||
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
|
|
||||||
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
|
|
||||||
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
|
|
||||||
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
|
|
||||||
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
|
|
||||||
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
|
|
||||||
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
|
|
||||||
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
|
|
||||||
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
|
|
||||||
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
|
|
||||||
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
|
|
||||||
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
|
|
||||||
//! ```
|
//! ```
|
||||||
|
|
||||||
use bytes::{Buf, Bytes};
|
use bytes::{Buf, Bytes};
|
||||||
use criterion::{BenchmarkId, Criterion};
|
use criterion::{BenchmarkId, Criterion};
|
||||||
use pageserver::{
|
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||||
config::PageServerConf,
|
|
||||||
walrecord::NeonWalRecord,
|
|
||||||
walredo::{PostgresRedoManager, ProcessKind},
|
|
||||||
};
|
|
||||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||||
use std::{
|
use std::{
|
||||||
sync::Arc,
|
sync::Arc,
|
||||||
@@ -80,39 +60,32 @@ use tokio::{sync::Barrier, task::JoinSet};
|
|||||||
use utils::{id::TenantId, lsn::Lsn};
|
use utils::{id::TenantId, lsn::Lsn};
|
||||||
|
|
||||||
fn bench(c: &mut Criterion) {
|
fn bench(c: &mut Criterion) {
|
||||||
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
|
{
|
||||||
{
|
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
for nclients in nclients {
|
||||||
for nclients in nclients {
|
let mut group = c.benchmark_group("short");
|
||||||
let mut group = c.benchmark_group(format!("{process_kind}-short"));
|
group.bench_with_input(
|
||||||
group.bench_with_input(
|
BenchmarkId::from_parameter(nclients),
|
||||||
BenchmarkId::from_parameter(nclients),
|
&nclients,
|
||||||
&nclients,
|
|b, nclients| {
|
||||||
|b, nclients| {
|
let redo_work = Arc::new(Request::short_input());
|
||||||
let redo_work = Arc::new(Request::short_input());
|
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||||
b.iter_custom(|iters| {
|
},
|
||||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
);
|
||||||
});
|
|
||||||
},
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
{
|
{
|
||||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||||
for nclients in nclients {
|
for nclients in nclients {
|
||||||
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
|
let mut group = c.benchmark_group("medium");
|
||||||
group.bench_with_input(
|
group.bench_with_input(
|
||||||
BenchmarkId::from_parameter(nclients),
|
BenchmarkId::from_parameter(nclients),
|
||||||
&nclients,
|
&nclients,
|
||||||
|b, nclients| {
|
|b, nclients| {
|
||||||
let redo_work = Arc::new(Request::medium_input());
|
let redo_work = Arc::new(Request::medium_input());
|
||||||
b.iter_custom(|iters| {
|
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
},
|
||||||
});
|
);
|
||||||
},
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -120,16 +93,10 @@ criterion::criterion_group!(benches, bench);
|
|||||||
criterion::criterion_main!(benches);
|
criterion::criterion_main!(benches);
|
||||||
|
|
||||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||||
fn bench_impl(
|
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||||
process_kind: ProcessKind,
|
|
||||||
redo_work: Arc<Request>,
|
|
||||||
n_redos: u64,
|
|
||||||
nclients: u64,
|
|
||||||
) -> Duration {
|
|
||||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||||
|
|
||||||
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||||
conf.walredo_process_kind = process_kind;
|
|
||||||
let conf = Box::leak(Box::new(conf));
|
let conf = Box::leak(Box::new(conf));
|
||||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||||
|
|
||||||
@@ -158,27 +125,13 @@ fn bench_impl(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let elapsed = rt.block_on(async move {
|
rt.block_on(async move {
|
||||||
let mut total_wallclock_time = Duration::ZERO;
|
let mut total_wallclock_time = Duration::ZERO;
|
||||||
while let Some(res) = tasks.join_next().await {
|
while let Some(res) = tasks.join_next().await {
|
||||||
total_wallclock_time += res.unwrap();
|
total_wallclock_time += res.unwrap();
|
||||||
}
|
}
|
||||||
total_wallclock_time
|
total_wallclock_time
|
||||||
});
|
})
|
||||||
|
|
||||||
// consistency check to ensure process kind setting worked
|
|
||||||
if nredos_per_client > 0 {
|
|
||||||
assert_eq!(
|
|
||||||
manager
|
|
||||||
.status()
|
|
||||||
.process
|
|
||||||
.map(|p| p.kind)
|
|
||||||
.expect("the benchmark work causes a walredo process to be spawned"),
|
|
||||||
std::borrow::Cow::Borrowed(process_kind.into())
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
elapsed
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn client(
|
async fn client(
|
||||||
|
|||||||
@@ -1,8 +1,12 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use bytes::Bytes;
|
||||||
use pageserver_api::{models::*, shard::TenantShardId};
|
use pageserver_api::{models::*, shard::TenantShardId};
|
||||||
use reqwest::{IntoUrl, Method, StatusCode};
|
use reqwest::{IntoUrl, Method, StatusCode};
|
||||||
use utils::{
|
use utils::{
|
||||||
http::error::HttpErrorBody,
|
http::error::HttpErrorBody,
|
||||||
id::{TenantId, TimelineId},
|
id::{TenantId, TimelineId},
|
||||||
|
lsn::Lsn,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub mod util;
|
pub mod util;
|
||||||
@@ -243,6 +247,19 @@ impl Client {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn tenant_scan_remote_storage(
|
||||||
|
&self,
|
||||||
|
tenant_id: TenantId,
|
||||||
|
) -> Result<TenantScanRemoteStorageResponse> {
|
||||||
|
let uri = format!(
|
||||||
|
"{}/v1/tenant/{tenant_id}/scan_remote_storage",
|
||||||
|
self.mgmt_api_endpoint
|
||||||
|
);
|
||||||
|
let response = self.request(Method::GET, &uri, ()).await?;
|
||||||
|
let body = response.json().await.map_err(Error::ReceiveBody)?;
|
||||||
|
Ok(body)
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
|
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
|
||||||
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
|
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
|
||||||
self.request(Method::PUT, &uri, req).await?;
|
self.request(Method::PUT, &uri, req).await?;
|
||||||
@@ -271,6 +288,34 @@ impl Client {
|
|||||||
Ok((status, progress))
|
Ok((status, progress))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn tenant_secondary_status(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
) -> Result<SecondaryProgress> {
|
||||||
|
let path = reqwest::Url::parse(&format!(
|
||||||
|
"{}/v1/tenant/{}/secondary/status",
|
||||||
|
self.mgmt_api_endpoint, tenant_shard_id
|
||||||
|
))
|
||||||
|
.expect("Cannot build URL");
|
||||||
|
|
||||||
|
self.request(Method::GET, path, ())
|
||||||
|
.await?
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(Error::ReceiveBody)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> {
|
||||||
|
let path = reqwest::Url::parse(&format!(
|
||||||
|
"{}/v1/tenant/{}/heatmap_upload",
|
||||||
|
self.mgmt_api_endpoint, tenant_id
|
||||||
|
))
|
||||||
|
.expect("Cannot build URL");
|
||||||
|
|
||||||
|
self.request(Method::POST, path, ()).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn location_config(
|
pub async fn location_config(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
@@ -278,10 +323,7 @@ impl Client {
|
|||||||
flush_ms: Option<std::time::Duration>,
|
flush_ms: Option<std::time::Duration>,
|
||||||
lazy: bool,
|
lazy: bool,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let req_body = TenantLocationConfigRequest {
|
let req_body = TenantLocationConfigRequest { config };
|
||||||
tenant_id: Some(tenant_shard_id),
|
|
||||||
config,
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut path = reqwest::Url::parse(&format!(
|
let mut path = reqwest::Url::parse(&format!(
|
||||||
"{}/v1/tenant/{}/location_config",
|
"{}/v1/tenant/{}/location_config",
|
||||||
@@ -448,6 +490,18 @@ impl Client {
|
|||||||
.map_err(Error::ReceiveBody)
|
.map_err(Error::ReceiveBody)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn top_tenant_shards(
|
||||||
|
&self,
|
||||||
|
request: TopTenantShardsRequest,
|
||||||
|
) -> Result<TopTenantShardsResponse> {
|
||||||
|
let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint);
|
||||||
|
self.request(Method::POST, uri, request)
|
||||||
|
.await?
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(Error::ReceiveBody)
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn layer_map_info(
|
pub async fn layer_map_info(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
@@ -511,4 +565,57 @@ impl Client {
|
|||||||
}),
|
}),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn ingest_aux_files(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
aux_files: HashMap<String, String>,
|
||||||
|
) -> Result<bool> {
|
||||||
|
let uri = format!(
|
||||||
|
"{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
|
||||||
|
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
|
||||||
|
);
|
||||||
|
let resp = self
|
||||||
|
.request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files })
|
||||||
|
.await?;
|
||||||
|
match resp.status() {
|
||||||
|
StatusCode::OK => Ok(true),
|
||||||
|
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||||
|
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||||
|
Err(_) => {
|
||||||
|
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_aux_files(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
lsn: Lsn,
|
||||||
|
) -> Result<HashMap<String, Bytes>> {
|
||||||
|
let uri = format!(
|
||||||
|
"{}/v1/tenant/{}/timeline/{}/list_aux_files",
|
||||||
|
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
|
||||||
|
);
|
||||||
|
let resp = self
|
||||||
|
.request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
|
||||||
|
.await?;
|
||||||
|
match resp.status() {
|
||||||
|
StatusCode::OK => {
|
||||||
|
let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
|
||||||
|
Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
|
||||||
|
})?;
|
||||||
|
Ok(resp)
|
||||||
|
}
|
||||||
|
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||||
|
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||||
|
Err(_) => {
|
||||||
|
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -60,7 +60,7 @@ impl Client {
|
|||||||
) -> anyhow::Result<PagestreamClient> {
|
) -> anyhow::Result<PagestreamClient> {
|
||||||
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
|
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
|
||||||
.client
|
.client
|
||||||
.copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
|
.copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
|
||||||
.await?;
|
.await?;
|
||||||
let Client {
|
let Client {
|
||||||
cancel_on_client_drop,
|
cancel_on_client_drop,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use clap::{Parser, Subcommand};
|
use clap::{Parser, Subcommand};
|
||||||
|
use pageserver_compaction::helpers::PAGE_SZ;
|
||||||
use pageserver_compaction::simulator::MockTimeline;
|
use pageserver_compaction::simulator::MockTimeline;
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
@@ -51,7 +52,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
|
|||||||
let mut executor = MockTimeline::new();
|
let mut executor = MockTimeline::new();
|
||||||
|
|
||||||
// Convert the logical size in MB into a key range.
|
// Convert the logical size in MB into a key range.
|
||||||
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
|
let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
|
||||||
//let key_range = u64::MIN..u64::MAX;
|
//let key_range = u64::MIN..u64::MAX;
|
||||||
println!(
|
println!(
|
||||||
"starting simulation with key range {:016X}-{:016X}",
|
"starting simulation with key range {:016X}-{:016X}",
|
||||||
|
|||||||
@@ -18,12 +18,15 @@
|
|||||||
//! database size. For example, if the logical database size is 10 GB, we would
|
//! database size. For example, if the logical database size is 10 GB, we would
|
||||||
//! generate new image layers every 10 GB of WAL.
|
//! generate new image layers every 10 GB of WAL.
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
|
use pageserver_api::shard::ShardIdentity;
|
||||||
use tracing::{debug, info};
|
use tracing::{debug, info};
|
||||||
|
|
||||||
use std::collections::{HashSet, VecDeque};
|
use std::collections::{HashSet, VecDeque};
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
|
|
||||||
use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
|
use crate::helpers::{
|
||||||
|
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
|
||||||
|
};
|
||||||
use crate::interface::*;
|
use crate::interface::*;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
@@ -103,7 +106,13 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
|||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
if target_file_size == u64::MAX {
|
if current_level_target_height == u64::MAX {
|
||||||
|
// our target height includes all possible lsns
|
||||||
|
info!(
|
||||||
|
level = current_level_no,
|
||||||
|
depth = depth,
|
||||||
|
"compaction loop reached max current_level_target_height"
|
||||||
|
);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
current_level_no += 1;
|
current_level_no += 1;
|
||||||
@@ -125,6 +134,7 @@ async fn compact_level<E: CompactionJobExecutor>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
let mut state = LevelCompactionState {
|
let mut state = LevelCompactionState {
|
||||||
|
shard_identity: *executor.get_shard_identity(),
|
||||||
target_file_size,
|
target_file_size,
|
||||||
_lsn_range: lsn_range.clone(),
|
_lsn_range: lsn_range.clone(),
|
||||||
layers: layer_fragments,
|
layers: layer_fragments,
|
||||||
@@ -164,6 +174,8 @@ struct LevelCompactionState<'a, E>
|
|||||||
where
|
where
|
||||||
E: CompactionJobExecutor,
|
E: CompactionJobExecutor,
|
||||||
{
|
{
|
||||||
|
shard_identity: ShardIdentity,
|
||||||
|
|
||||||
// parameters
|
// parameters
|
||||||
target_file_size: u64,
|
target_file_size: u64,
|
||||||
|
|
||||||
@@ -366,7 +378,8 @@ where
|
|||||||
.executor
|
.executor
|
||||||
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
|
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
|
||||||
.await?,
|
.await?,
|
||||||
) * 8192;
|
&self.shard_identity,
|
||||||
|
) * PAGE_SZ;
|
||||||
|
|
||||||
let wal_size = job
|
let wal_size = job
|
||||||
.input_layers
|
.input_layers
|
||||||
@@ -428,9 +441,9 @@ where
|
|||||||
let mut window = KeyspaceWindow::new(
|
let mut window = KeyspaceWindow::new(
|
||||||
E::Key::MIN..E::Key::MAX,
|
E::Key::MIN..E::Key::MAX,
|
||||||
keyspace,
|
keyspace,
|
||||||
self.target_file_size / 8192,
|
self.target_file_size / PAGE_SZ,
|
||||||
);
|
);
|
||||||
while let Some(key_range) = window.choose_next_image() {
|
while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
|
||||||
new_jobs.push(CompactionJob::<E> {
|
new_jobs.push(CompactionJob::<E> {
|
||||||
key_range,
|
key_range,
|
||||||
lsn_range: job.lsn_range.clone(),
|
lsn_range: job.lsn_range.clone(),
|
||||||
@@ -517,8 +530,6 @@ where
|
|||||||
// If we have accumulated only a narrow band of keyspace, create an
|
// If we have accumulated only a narrow band of keyspace, create an
|
||||||
// image layer. Otherwise write a delta layer.
|
// image layer. Otherwise write a delta layer.
|
||||||
|
|
||||||
// FIXME: deal with the case of lots of values for same key
|
|
||||||
|
|
||||||
// FIXME: we are ignoring images here. Did we already divide the work
|
// FIXME: we are ignoring images here. Did we already divide the work
|
||||||
// so that we won't encounter them here?
|
// so that we won't encounter them here?
|
||||||
|
|
||||||
@@ -530,43 +541,101 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Open stream
|
// Open stream
|
||||||
let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
|
let key_value_stream =
|
||||||
|
std::pin::pin!(merge_delta_keys_buffered::<E>(deltas.as_slice(), ctx)
|
||||||
|
.await?
|
||||||
|
.map(Result::<_, anyhow::Error>::Ok));
|
||||||
let mut new_jobs = Vec::new();
|
let mut new_jobs = Vec::new();
|
||||||
|
|
||||||
// Slide a window through the keyspace
|
// Slide a window through the keyspace
|
||||||
let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
|
let mut key_accum =
|
||||||
|
std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size));
|
||||||
let mut all_in_window: bool = false;
|
let mut all_in_window: bool = false;
|
||||||
let mut window = Window::new();
|
let mut window = Window::new();
|
||||||
|
|
||||||
|
// Helper function to create a job for a new delta layer with given key-lsn
|
||||||
|
// rectangle.
|
||||||
|
let create_delta_job = |key_range, lsn_range: &Range<Lsn>, new_jobs: &mut Vec<_>| {
|
||||||
|
// The inputs for the job are all the input layers of the original job that
|
||||||
|
// overlap with the rectangle.
|
||||||
|
let batch_layers: Vec<LayerId> = job
|
||||||
|
.input_layers
|
||||||
|
.iter()
|
||||||
|
.filter(|layer_id| {
|
||||||
|
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
|
||||||
|
})
|
||||||
|
.cloned()
|
||||||
|
.collect();
|
||||||
|
assert!(!batch_layers.is_empty());
|
||||||
|
new_jobs.push(CompactionJob {
|
||||||
|
key_range,
|
||||||
|
lsn_range: lsn_range.clone(),
|
||||||
|
strategy: CompactionStrategy::CreateDelta,
|
||||||
|
input_layers: batch_layers,
|
||||||
|
completed: false,
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if all_in_window && window.elems.is_empty() {
|
if all_in_window && window.is_empty() {
|
||||||
// All done!
|
// All done!
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we now have enough keyspace for next delta layer in the window, create a
|
||||||
|
// new delta layer
|
||||||
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
|
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
|
||||||
{
|
{
|
||||||
let batch_layers: Vec<LayerId> = job
|
create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
|
||||||
.input_layers
|
continue;
|
||||||
.iter()
|
}
|
||||||
.filter(|layer_id| {
|
assert!(!all_in_window);
|
||||||
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
|
|
||||||
})
|
// Process next key in the key space
|
||||||
.cloned()
|
match key_accum.next().await.transpose()? {
|
||||||
.collect();
|
None => {
|
||||||
assert!(!batch_layers.is_empty());
|
|
||||||
new_jobs.push(CompactionJob {
|
|
||||||
key_range,
|
|
||||||
lsn_range: job.lsn_range.clone(),
|
|
||||||
strategy: CompactionStrategy::CreateDelta,
|
|
||||||
input_layers: batch_layers,
|
|
||||||
completed: false,
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
assert!(!all_in_window);
|
|
||||||
if let Some(next_key) = key_accum.next().await.transpose()? {
|
|
||||||
window.feed(next_key.key, next_key.size);
|
|
||||||
} else {
|
|
||||||
all_in_window = true;
|
all_in_window = true;
|
||||||
}
|
}
|
||||||
|
Some(next_key) if next_key.partition_lsns.is_empty() => {
|
||||||
|
// Normal case: extend the window by the key
|
||||||
|
window.feed(next_key.key, next_key.size);
|
||||||
|
}
|
||||||
|
Some(next_key) => {
|
||||||
|
// A key with too large size impact for a single delta layer. This
|
||||||
|
// case occurs if you make a huge number of updates for a single key.
|
||||||
|
//
|
||||||
|
// Drain the window with has_more = false to make a clean cut before
|
||||||
|
// the key, and then make dedicated delta layers for the single key.
|
||||||
|
//
|
||||||
|
// We cannot cluster the key with the others, because we don't want
|
||||||
|
// layer files to overlap with each other in the lsn,key space (no
|
||||||
|
// overlaps for the rectangles).
|
||||||
|
let key = next_key.key;
|
||||||
|
debug!("key {key} with size impact larger than the layer size");
|
||||||
|
while !window.is_empty() {
|
||||||
|
let has_more = false;
|
||||||
|
let key_range = window.choose_next_delta(self.target_file_size, has_more)
|
||||||
|
.expect("with has_more==false, choose_next_delta always returns something for a non-empty Window");
|
||||||
|
create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Not really required: but here for future resilience:
|
||||||
|
// We make a "gap" here, so any structure the window holds should
|
||||||
|
// probably be reset.
|
||||||
|
window = Window::new();
|
||||||
|
|
||||||
|
let mut prior_lsn = job.lsn_range.start;
|
||||||
|
let mut lsn_ranges = Vec::new();
|
||||||
|
for (lsn, _size) in next_key.partition_lsns.iter() {
|
||||||
|
lsn_ranges.push(prior_lsn..*lsn);
|
||||||
|
prior_lsn = *lsn;
|
||||||
|
}
|
||||||
|
lsn_ranges.push(prior_lsn..job.lsn_range.end);
|
||||||
|
for lsn_range in lsn_ranges {
|
||||||
|
let key_range = key..key.next();
|
||||||
|
create_delta_job(key_range, &lsn_range, &mut new_jobs);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -594,8 +663,8 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sliding window through keyspace and values
|
/// Sliding window through keyspace and values for image layer
|
||||||
// This is used by over_with_images to decide on good split points
|
/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
|
||||||
struct KeyspaceWindow<K> {
|
struct KeyspaceWindow<K> {
|
||||||
head: KeyspaceWindowHead<K>,
|
head: KeyspaceWindowHead<K>,
|
||||||
|
|
||||||
@@ -623,7 +692,12 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Advance the cursor until it reaches 'target_keysize'.
|
// Advance the cursor until it reaches 'target_keysize'.
|
||||||
fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
|
fn advance_until_size(
|
||||||
|
&mut self,
|
||||||
|
w: &KeyspaceWindowHead<K>,
|
||||||
|
max_size: u64,
|
||||||
|
shard_identity: &ShardIdentity,
|
||||||
|
) {
|
||||||
while self.accum_keysize < max_size && !self.reached_end(w) {
|
while self.accum_keysize < max_size && !self.reached_end(w) {
|
||||||
let curr_range = &w.keyspace[self.keyspace_idx];
|
let curr_range = &w.keyspace[self.keyspace_idx];
|
||||||
if self.end_key < curr_range.start {
|
if self.end_key < curr_range.start {
|
||||||
@@ -632,7 +706,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// We're now within 'curr_range'. Can we advance past it completely?
|
// We're now within 'curr_range'. Can we advance past it completely?
|
||||||
let distance = K::key_range_size(&(self.end_key..curr_range.end));
|
let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity);
|
||||||
if (self.accum_keysize + distance as u64) < max_size {
|
if (self.accum_keysize + distance as u64) < max_size {
|
||||||
// oh yeah, it fits
|
// oh yeah, it fits
|
||||||
self.end_key = curr_range.end;
|
self.end_key = curr_range.end;
|
||||||
@@ -641,7 +715,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
|
|||||||
} else {
|
} else {
|
||||||
// advance within the range
|
// advance within the range
|
||||||
let skip_key = self.end_key.skip_some();
|
let skip_key = self.end_key.skip_some();
|
||||||
let distance = K::key_range_size(&(self.end_key..skip_key));
|
let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity);
|
||||||
if (self.accum_keysize + distance as u64) < max_size {
|
if (self.accum_keysize + distance as u64) < max_size {
|
||||||
self.end_key = skip_key;
|
self.end_key = skip_key;
|
||||||
self.accum_keysize += distance as u64;
|
self.accum_keysize += distance as u64;
|
||||||
@@ -677,7 +751,7 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn choose_next_image(&mut self) -> Option<Range<K>> {
|
fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option<Range<K>> {
|
||||||
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
|
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
|
||||||
// we've reached the end
|
// we've reached the end
|
||||||
return None;
|
return None;
|
||||||
@@ -687,6 +761,7 @@ where
|
|||||||
next_pos.advance_until_size(
|
next_pos.advance_until_size(
|
||||||
&self.head,
|
&self.head,
|
||||||
self.start_pos.accum_keysize + self.head.target_keysize,
|
self.start_pos.accum_keysize + self.head.target_keysize,
|
||||||
|
shard_identity,
|
||||||
);
|
);
|
||||||
|
|
||||||
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
|
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
|
||||||
@@ -695,6 +770,7 @@ where
|
|||||||
end_pos.advance_until_size(
|
end_pos.advance_until_size(
|
||||||
&self.head,
|
&self.head,
|
||||||
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
|
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
|
||||||
|
shard_identity,
|
||||||
);
|
);
|
||||||
if end_pos.reached_end(&self.head) {
|
if end_pos.reached_end(&self.head) {
|
||||||
// gobble up any unused keyspace between the last used key and end of the range
|
// gobble up any unused keyspace between the last used key and end of the range
|
||||||
@@ -728,9 +804,9 @@ struct WindowElement<K> {
|
|||||||
accum_size: u64,
|
accum_size: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sliding window through keyspace and values
|
/// Sliding window through keyspace and values for delta layer tiling
|
||||||
//
|
///
|
||||||
// This is used to decide what layer to write next, from the beginning of the window.
|
/// This is used to decide which delta layer to write next.
|
||||||
struct Window<K> {
|
struct Window<K> {
|
||||||
elems: VecDeque<WindowElement<K>>,
|
elems: VecDeque<WindowElement<K>>,
|
||||||
|
|
||||||
@@ -754,11 +830,13 @@ where
|
|||||||
fn feed(&mut self, key: K, size: u64) {
|
fn feed(&mut self, key: K, size: u64) {
|
||||||
let last_size;
|
let last_size;
|
||||||
if let Some(last) = self.elems.back_mut() {
|
if let Some(last) = self.elems.back_mut() {
|
||||||
assert!(last.last_key <= key);
|
// We require the keys to be strictly increasing for the window.
|
||||||
if key == last.last_key {
|
// Keys should already have been deduplicated by `accum_key_values`
|
||||||
last.accum_size += size;
|
assert!(
|
||||||
return;
|
last.last_key < key,
|
||||||
}
|
"last_key(={}) >= key(={key})",
|
||||||
|
last.last_key
|
||||||
|
);
|
||||||
last_size = last.accum_size;
|
last_size = last.accum_size;
|
||||||
} else {
|
} else {
|
||||||
last_size = 0;
|
last_size = 0;
|
||||||
@@ -780,6 +858,10 @@ where
|
|||||||
self.elems.front().unwrap().accum_size - self.splitoff_size
|
self.elems.front().unwrap().accum_size - self.splitoff_size
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn is_empty(&self) -> bool {
|
||||||
|
self.elems.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
fn commit_upto(&mut self, mut upto: usize) {
|
fn commit_upto(&mut self, mut upto: usize) {
|
||||||
while upto > 1 {
|
while upto > 1 {
|
||||||
let popped = self.elems.pop_front().unwrap();
|
let popped = self.elems.pop_front().unwrap();
|
||||||
@@ -842,7 +924,7 @@ where
|
|||||||
// If we're willing to stretch it up to 1.25 target size, could we
|
// If we're willing to stretch it up to 1.25 target size, could we
|
||||||
// gobble up the rest of the work? This avoids creating very small
|
// gobble up the rest of the work? This avoids creating very small
|
||||||
// "tail" layers at the end of the keyspace
|
// "tail" layers at the end of the keyspace
|
||||||
if !has_more && self.remain_size() < target_size * 5 / 3 {
|
if !has_more && self.remain_size() < target_size * 5 / 4 {
|
||||||
self.commit_upto(self.elems.len());
|
self.commit_upto(self.elems.len());
|
||||||
} else {
|
} else {
|
||||||
let delta_split_at = self.find_size_split(target_size);
|
let delta_split_at = self.find_size_split(target_size);
|
||||||
|
|||||||
@@ -5,19 +5,30 @@ use crate::interface::*;
|
|||||||
use futures::future::BoxFuture;
|
use futures::future::BoxFuture;
|
||||||
use futures::{Stream, StreamExt};
|
use futures::{Stream, StreamExt};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
|
use pageserver_api::shard::ShardIdentity;
|
||||||
use pin_project_lite::pin_project;
|
use pin_project_lite::pin_project;
|
||||||
use std::collections::BinaryHeap;
|
use std::collections::BinaryHeap;
|
||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
|
use std::fmt::Display;
|
||||||
use std::future::Future;
|
use std::future::Future;
|
||||||
use std::ops::{DerefMut, Range};
|
use std::ops::{DerefMut, Range};
|
||||||
use std::pin::Pin;
|
use std::pin::Pin;
|
||||||
use std::task::{ready, Poll};
|
use std::task::{ready, Poll};
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
|
pub const PAGE_SZ: u64 = 8192;
|
||||||
|
|
||||||
|
pub fn keyspace_total_size<K>(
|
||||||
|
keyspace: &CompactionKeySpace<K>,
|
||||||
|
shard_identity: &ShardIdentity,
|
||||||
|
) -> u64
|
||||||
where
|
where
|
||||||
K: CompactionKey,
|
K: CompactionKey,
|
||||||
{
|
{
|
||||||
keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
|
keyspace
|
||||||
|
.iter()
|
||||||
|
.map(|r| K::key_range_size(r, shard_identity) as u64)
|
||||||
|
.sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
|
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
|
||||||
@@ -101,17 +112,40 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>(
|
||||||
|
layers: &'a [E::DeltaLayer],
|
||||||
|
ctx: &'a E::RequestContext,
|
||||||
|
) -> anyhow::Result<impl Stream<Item = <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>
|
||||||
|
{
|
||||||
|
let mut keys = Vec::new();
|
||||||
|
for l in layers {
|
||||||
|
// Boxing and casting to LoadFuture is required to obtain the right Sync bound.
|
||||||
|
// If we do l.load_keys(ctx).await? directly, there is a compilation error.
|
||||||
|
let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx));
|
||||||
|
keys.extend(load_future.await?.into_iter());
|
||||||
|
}
|
||||||
|
keys.sort_by_key(|k| (k.key(), k.lsn()));
|
||||||
|
let stream = futures::stream::iter(keys.into_iter());
|
||||||
|
Ok(stream)
|
||||||
|
}
|
||||||
|
|
||||||
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
|
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
|
||||||
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
|
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
|
||||||
Unloaded(&'a E::DeltaLayer),
|
Unloaded(&'a E::DeltaLayer),
|
||||||
}
|
}
|
||||||
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
|
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
|
||||||
fn key(&self) -> E::Key {
|
fn min_key(&self) -> E::Key {
|
||||||
match self {
|
match self {
|
||||||
Self::Loaded(entries) => entries.front().unwrap().key(),
|
Self::Loaded(entries) => entries.front().unwrap().key(),
|
||||||
Self::Unloaded(dl) => dl.key_range().start,
|
Self::Unloaded(dl) => dl.key_range().start,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
fn min_lsn(&self) -> Lsn {
|
||||||
|
match self {
|
||||||
|
Self::Loaded(entries) => entries.front().unwrap().lsn(),
|
||||||
|
Self::Unloaded(dl) => dl.lsn_range().start,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
|
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||||
@@ -121,12 +155,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
|
|||||||
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
|
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
|
||||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||||
// reverse order so that we get a min-heap
|
// reverse order so that we get a min-heap
|
||||||
other.key().cmp(&self.key())
|
(other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
|
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
self.key().eq(&other.key())
|
self.cmp(other) == std::cmp::Ordering::Equal
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
|
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
|
||||||
@@ -203,11 +237,16 @@ pub struct KeySize<K> {
|
|||||||
pub key: K,
|
pub key: K,
|
||||||
pub num_values: u64,
|
pub num_values: u64,
|
||||||
pub size: u64,
|
pub size: u64,
|
||||||
|
/// The lsns to partition at (if empty then no per-lsn partitioning)
|
||||||
|
pub partition_lsns: Vec<(Lsn, u64)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
|
pub fn accum_key_values<'a, I, K, D, E>(
|
||||||
|
input: I,
|
||||||
|
target_size: u64,
|
||||||
|
) -> impl Stream<Item = Result<KeySize<K>, E>>
|
||||||
where
|
where
|
||||||
K: Eq,
|
K: Eq + PartialOrd + Display + Copy,
|
||||||
I: Stream<Item = Result<D, E>>,
|
I: Stream<Item = Result<D, E>>,
|
||||||
D: CompactionDeltaEntry<'a, K>,
|
D: CompactionDeltaEntry<'a, K>,
|
||||||
{
|
{
|
||||||
@@ -217,25 +256,39 @@ where
|
|||||||
|
|
||||||
if let Some(first) = input.next().await {
|
if let Some(first) = input.next().await {
|
||||||
let first = first?;
|
let first = first?;
|
||||||
|
let mut part_size = first.size();
|
||||||
let mut accum: KeySize<K> = KeySize {
|
let mut accum: KeySize<K> = KeySize {
|
||||||
key: first.key(),
|
key: first.key(),
|
||||||
num_values: 1,
|
num_values: 1,
|
||||||
size: first.size(),
|
size: part_size,
|
||||||
|
partition_lsns: Vec::new(),
|
||||||
};
|
};
|
||||||
|
let mut last_key = accum.key;
|
||||||
while let Some(this) = input.next().await {
|
while let Some(this) = input.next().await {
|
||||||
let this = this?;
|
let this = this?;
|
||||||
if this.key() == accum.key {
|
if this.key() == accum.key {
|
||||||
accum.size += this.size();
|
let add_size = this.size();
|
||||||
|
if part_size + add_size > target_size {
|
||||||
|
accum.partition_lsns.push((this.lsn(), part_size));
|
||||||
|
part_size = 0;
|
||||||
|
}
|
||||||
|
part_size += add_size;
|
||||||
|
accum.size += add_size;
|
||||||
accum.num_values += 1;
|
accum.num_values += 1;
|
||||||
} else {
|
} else {
|
||||||
|
assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
|
||||||
|
last_key = accum.key;
|
||||||
yield accum;
|
yield accum;
|
||||||
|
part_size = this.size();
|
||||||
accum = KeySize {
|
accum = KeySize {
|
||||||
key: this.key(),
|
key: this.key(),
|
||||||
num_values: 1,
|
num_values: 1,
|
||||||
size: this.size(),
|
size: part_size,
|
||||||
|
partition_lsns: Vec::new(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
|
||||||
yield accum;
|
yield accum;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -184,6 +184,12 @@ impl<L> Level<L> {
|
|||||||
}
|
}
|
||||||
let mut events: Vec<Event<K>> = Vec::new();
|
let mut events: Vec<Event<K>> = Vec::new();
|
||||||
for (idx, l) in self.layers.iter().enumerate() {
|
for (idx, l) in self.layers.iter().enumerate() {
|
||||||
|
let key_range = l.key_range();
|
||||||
|
if key_range.end == key_range.start.next() && l.is_delta() {
|
||||||
|
// Ignore single-key delta layers: they can be stacked on top of each other,
|
||||||
|
// because stacking is the only way to cut them further.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
events.push(Event {
|
events.push(Event {
|
||||||
key: l.key_range().start,
|
key: l.key_range().start,
|
||||||
layer_idx: idx,
|
layer_idx: idx,
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
//! All the heavy lifting is done by the create_image and create_delta
|
//! All the heavy lifting is done by the create_image and create_delta
|
||||||
//! functions that the implementor provides.
|
//! functions that the implementor provides.
|
||||||
use futures::Future;
|
use futures::Future;
|
||||||
use pageserver_api::{key::Key, keyspace::key_range_size};
|
use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity};
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
@@ -32,6 +32,8 @@ pub trait CompactionJobExecutor {
|
|||||||
// Functions that the planner uses to support its decisions
|
// Functions that the planner uses to support its decisions
|
||||||
// ----
|
// ----
|
||||||
|
|
||||||
|
fn get_shard_identity(&self) -> &ShardIdentity;
|
||||||
|
|
||||||
/// Return all layers that overlap the given bounding box.
|
/// Return all layers that overlap the given bounding box.
|
||||||
fn get_layers(
|
fn get_layers(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -98,7 +100,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
|
|||||||
///
|
///
|
||||||
/// This returns u32, for compatibility with Repository::key. If the
|
/// This returns u32, for compatibility with Repository::key. If the
|
||||||
/// distance is larger, return u32::MAX.
|
/// distance is larger, return u32::MAX.
|
||||||
fn key_range_size(key_range: &Range<Self>) -> u32;
|
fn key_range_size(key_range: &Range<Self>, shard_identity: &ShardIdentity) -> u32;
|
||||||
|
|
||||||
// return "self + 1"
|
// return "self + 1"
|
||||||
fn next(&self) -> Self;
|
fn next(&self) -> Self;
|
||||||
@@ -113,8 +115,8 @@ impl CompactionKey for Key {
|
|||||||
const MIN: Self = Self::MIN;
|
const MIN: Self = Self::MIN;
|
||||||
const MAX: Self = Self::MAX;
|
const MAX: Self = Self::MAX;
|
||||||
|
|
||||||
fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
|
fn key_range_size(r: &std::ops::Range<Self>, shard_identity: &ShardIdentity) -> u32 {
|
||||||
key_range_size(r)
|
ShardedRange::new(r.clone(), shard_identity).page_count()
|
||||||
}
|
}
|
||||||
fn next(&self) -> Key {
|
fn next(&self) -> Key {
|
||||||
(self as &Key).next()
|
(self as &Key).next()
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ mod draw;
|
|||||||
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
|
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
|
||||||
|
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
|
use pageserver_api::shard::ShardIdentity;
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
@@ -13,6 +14,7 @@ use std::ops::Range;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::Mutex;
|
use std::sync::Mutex;
|
||||||
|
|
||||||
|
use crate::helpers::PAGE_SZ;
|
||||||
use crate::helpers::{merge_delta_keys, overlaps_with};
|
use crate::helpers::{merge_delta_keys, overlaps_with};
|
||||||
|
|
||||||
use crate::interface;
|
use crate::interface;
|
||||||
@@ -71,7 +73,7 @@ impl interface::CompactionKey for Key {
|
|||||||
const MIN: Self = u64::MIN;
|
const MIN: Self = u64::MIN;
|
||||||
const MAX: Self = u64::MAX;
|
const MAX: Self = u64::MAX;
|
||||||
|
|
||||||
fn key_range_size(key_range: &Range<Self>) -> u32 {
|
fn key_range_size(key_range: &Range<Self>, _shard_identity: &ShardIdentity) -> u32 {
|
||||||
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
|
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -378,8 +380,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
|
|||||||
}
|
}
|
||||||
fn file_size(&self) -> u64 {
|
fn file_size(&self) -> u64 {
|
||||||
match self {
|
match self {
|
||||||
MockLayer::Delta(this) => this.file_size(),
|
MockLayer::Delta(this) => this.file_size,
|
||||||
MockLayer::Image(this) => this.file_size(),
|
MockLayer::Image(this) => this.file_size,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn short_id(&self) -> String {
|
fn short_id(&self) -> String {
|
||||||
@@ -434,6 +436,11 @@ impl interface::CompactionJobExecutor for MockTimeline {
|
|||||||
type ImageLayer = Arc<MockImageLayer>;
|
type ImageLayer = Arc<MockImageLayer>;
|
||||||
type RequestContext = MockRequestContext;
|
type RequestContext = MockRequestContext;
|
||||||
|
|
||||||
|
fn get_shard_identity(&self) -> &ShardIdentity {
|
||||||
|
static IDENTITY: ShardIdentity = ShardIdentity::unsharded();
|
||||||
|
&IDENTITY
|
||||||
|
}
|
||||||
|
|
||||||
async fn get_layers(
|
async fn get_layers(
|
||||||
&mut self,
|
&mut self,
|
||||||
key_range: &Range<Self::Key>,
|
key_range: &Range<Self::Key>,
|
||||||
@@ -503,7 +510,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
|
|||||||
let new_layer = Arc::new(MockImageLayer {
|
let new_layer = Arc::new(MockImageLayer {
|
||||||
key_range: key_range.clone(),
|
key_range: key_range.clone(),
|
||||||
lsn_range: lsn..lsn,
|
lsn_range: lsn..lsn,
|
||||||
file_size: accum_size * 8192,
|
file_size: accum_size * PAGE_SZ,
|
||||||
deleted: Mutex::new(false),
|
deleted: Mutex::new(false),
|
||||||
});
|
});
|
||||||
info!(
|
info!(
|
||||||
|
|||||||
@@ -1,23 +1,35 @@
|
|||||||
|
use once_cell::sync::OnceCell;
|
||||||
use pageserver_compaction::interface::CompactionLayer;
|
use pageserver_compaction::interface::CompactionLayer;
|
||||||
use pageserver_compaction::simulator::MockTimeline;
|
use pageserver_compaction::simulator::MockTimeline;
|
||||||
|
use utils::logging;
|
||||||
|
|
||||||
|
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
|
||||||
|
|
||||||
|
pub(crate) fn setup_logging() {
|
||||||
|
LOG_HANDLE.get_or_init(|| {
|
||||||
|
logging::init(
|
||||||
|
logging::LogFormat::Test,
|
||||||
|
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||||
|
logging::Output::Stdout,
|
||||||
|
)
|
||||||
|
.expect("Failed to init test logging")
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
/// Test the extreme case that there are so many updates for a single key that
|
/// Test the extreme case that there are so many updates for a single key that
|
||||||
/// even if we produce an extremely narrow delta layer, spanning just that one
|
/// even if we produce an extremely narrow delta layer, spanning just that one
|
||||||
/// key, we still have too many records to fit in the target file size. We need to
|
/// key, we still have too many records to fit in the target file size. We need to
|
||||||
/// split in the LSN dimension too in that case.
|
/// split in the LSN dimension too in that case.
|
||||||
///
|
|
||||||
/// TODO: The code to avoid this problem has not been implemented yet! So the
|
|
||||||
/// assertion currently fails, but we need to make it not fail.
|
|
||||||
#[ignore]
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_many_updates_for_single_key() {
|
async fn test_many_updates_for_single_key() {
|
||||||
|
setup_logging();
|
||||||
let mut executor = MockTimeline::new();
|
let mut executor = MockTimeline::new();
|
||||||
executor.target_file_size = 10_000_000; // 10 MB
|
executor.target_file_size = 1_000_000; // 1 MB
|
||||||
|
|
||||||
// Ingest 100 MB of updates to a single key.
|
// Ingest 10 MB of updates to a single key.
|
||||||
for _ in 1..1000 {
|
for _ in 1..1000 {
|
||||||
executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
|
executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
|
||||||
executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
|
executor.ingest_uniform(1000, 10, &(0..1)).unwrap();
|
||||||
executor.compact().await.unwrap();
|
executor.compact().await.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -27,9 +39,32 @@ async fn test_many_updates_for_single_key() {
|
|||||||
}
|
}
|
||||||
for l in executor.live_layers.iter() {
|
for l in executor.live_layers.iter() {
|
||||||
assert!(l.file_size() < executor.target_file_size * 2);
|
assert!(l.file_size() < executor.target_file_size * 2);
|
||||||
// sanity check that none of the delta layers are stupidly small either
|
// Sanity check that none of the delta layers are empty either.
|
||||||
if l.is_delta() {
|
if l.is_delta() {
|
||||||
assert!(l.file_size() > executor.target_file_size / 2);
|
assert!(l.file_size() > 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_simple_updates() {
|
||||||
|
setup_logging();
|
||||||
|
let mut executor = MockTimeline::new();
|
||||||
|
executor.target_file_size = 500_000; // 500 KB
|
||||||
|
|
||||||
|
// Ingest some traffic.
|
||||||
|
for _ in 1..400 {
|
||||||
|
executor.ingest_uniform(100, 500, &(0..100_000)).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
for l in executor.live_layers.iter() {
|
||||||
|
println!("layer {}: {}", l.short_id(), l.file_size());
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("Running compaction...");
|
||||||
|
executor.compact().await.unwrap();
|
||||||
|
|
||||||
|
for l in executor.live_layers.iter() {
|
||||||
|
println!("layer {}: {}", l.short_id(), l.file_size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -9,21 +9,49 @@
|
|||||||
//! Coordinates in both axes are compressed for better readability.
|
//! Coordinates in both axes are compressed for better readability.
|
||||||
//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
|
//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
|
||||||
//!
|
//!
|
||||||
//! Example use:
|
//! The plain text API was chosen so that we can easily work with filenames from various
|
||||||
|
//! sources; see the Usage section below for examples.
|
||||||
|
//!
|
||||||
|
//! # Usage
|
||||||
|
//!
|
||||||
|
//! ## Producing the SVG
|
||||||
|
//!
|
||||||
//! ```bash
|
//! ```bash
|
||||||
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
//!
|
||||||
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
//! # local timeline dir
|
||||||
//! $ firefox out.svg
|
//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||||
|
//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
||||||
|
//!
|
||||||
|
//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
|
||||||
|
//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg
|
||||||
|
//!
|
||||||
|
//! # From an `index_part.json` in S3
|
||||||
|
//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
|
||||||
|
//!
|
||||||
|
//! # enrich with lines for gc_cutoff and a child branch point
|
||||||
|
//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! This API was chosen so that we can easily work with filenames extracted from ssh,
|
//! ## Viewing
|
||||||
//! or from pageserver log files.
|
|
||||||
//!
|
//!
|
||||||
//! TODO Consider shipping this as a grafana panel plugin:
|
//! **Inkscape** is better than the built-in viewers in browsers.
|
||||||
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
|
//!
|
||||||
use anyhow::Result;
|
//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X)
|
||||||
|
//! to see the layer file name in the comment field.
|
||||||
|
//!
|
||||||
|
//! ```bash
|
||||||
|
//!
|
||||||
|
//! # Linux
|
||||||
|
//! inkscape out.svg
|
||||||
|
//!
|
||||||
|
//! # macOS
|
||||||
|
//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg
|
||||||
|
//!
|
||||||
|
//! ```
|
||||||
|
//!
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
use pageserver::repository::Key;
|
use pageserver::repository::Key;
|
||||||
use pageserver::METADATA_FILE_NAME;
|
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::io::{self, BufRead};
|
use std::io::{self, BufRead};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
@@ -54,6 +82,11 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
|
|||||||
let split: Vec<&str> = name.split("__").collect();
|
let split: Vec<&str> = name.split("__").collect();
|
||||||
let keys: Vec<&str> = split[0].split('-').collect();
|
let keys: Vec<&str> = split[0].split('-').collect();
|
||||||
let mut lsns: Vec<&str> = split[1].split('-').collect();
|
let mut lsns: Vec<&str> = split[1].split('-').collect();
|
||||||
|
|
||||||
|
if lsns.last().expect("should").len() == 8 {
|
||||||
|
lsns.pop();
|
||||||
|
}
|
||||||
|
|
||||||
if lsns.len() == 1 {
|
if lsns.len() == 1 {
|
||||||
lsns.push(lsns[0]);
|
lsns.push(lsns[0]);
|
||||||
}
|
}
|
||||||
@@ -63,33 +96,94 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
|
|||||||
(keys, lsns)
|
(keys, lsns)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
enum LineKind {
|
||||||
|
GcCutoff,
|
||||||
|
Branch,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<LineKind> for Fill {
|
||||||
|
fn from(value: LineKind) -> Self {
|
||||||
|
match value {
|
||||||
|
LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)),
|
||||||
|
LineKind::Branch => Fill::Color(rgb(0, 255, 0)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for LineKind {
|
||||||
|
type Err = anyhow::Error;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> std::prelude::v1::Result<Self, Self::Err> {
|
||||||
|
Ok(match s {
|
||||||
|
"gc_cutoff" => LineKind::GcCutoff,
|
||||||
|
"branch" => LineKind::Branch,
|
||||||
|
_ => anyhow::bail!("unsupported linekind: {s}"),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn main() -> Result<()> {
|
pub fn main() -> Result<()> {
|
||||||
// Parse layer filenames from stdin
|
// Parse layer filenames from stdin
|
||||||
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
|
struct Layer {
|
||||||
|
filename: String,
|
||||||
|
key_range: Range<Key>,
|
||||||
|
lsn_range: Range<Lsn>,
|
||||||
|
}
|
||||||
|
let mut files: Vec<Layer> = vec![];
|
||||||
let stdin = io::stdin();
|
let stdin = io::stdin();
|
||||||
for line in stdin.lock().lines() {
|
|
||||||
|
let mut lines: Vec<(Lsn, LineKind)> = vec![];
|
||||||
|
|
||||||
|
for (lineno, line) in stdin.lock().lines().enumerate() {
|
||||||
|
let lineno = lineno + 1;
|
||||||
|
|
||||||
let line = line.unwrap();
|
let line = line.unwrap();
|
||||||
|
if let Some((kind, lsn)) = line.split_once(':') {
|
||||||
|
let (kind, lsn) = LineKind::from_str(kind)
|
||||||
|
.context("parse kind")
|
||||||
|
.and_then(|kind| {
|
||||||
|
if lsn.contains('/') {
|
||||||
|
Lsn::from_str(lsn)
|
||||||
|
} else {
|
||||||
|
Lsn::from_hex(lsn)
|
||||||
|
}
|
||||||
|
.map(|lsn| (kind, lsn))
|
||||||
|
.context("parse lsn")
|
||||||
|
})
|
||||||
|
.with_context(|| format!("parse {line:?} on {lineno}"))?;
|
||||||
|
lines.push((lsn, kind));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
let line = PathBuf::from_str(&line).unwrap();
|
let line = PathBuf::from_str(&line).unwrap();
|
||||||
let filename = line.file_name().unwrap();
|
let filename = line.file_name().unwrap();
|
||||||
let filename = filename.to_str().unwrap();
|
let filename = filename.to_str().unwrap();
|
||||||
if filename == METADATA_FILE_NAME {
|
let (key_range, lsn_range) = parse_filename(filename);
|
||||||
// Don't try and parse "metadata" like a key-lsn range
|
files.push(Layer {
|
||||||
continue;
|
filename: filename.to_owned(),
|
||||||
}
|
key_range,
|
||||||
let range = parse_filename(filename);
|
lsn_range,
|
||||||
ranges.push(range);
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collect all coordinates
|
// Collect all coordinates
|
||||||
let mut keys: Vec<Key> = vec![];
|
let mut keys: Vec<Key> = Vec::with_capacity(files.len());
|
||||||
let mut lsns: Vec<Lsn> = vec![];
|
let mut lsns: Vec<Lsn> = Vec::with_capacity(files.len() + lines.len());
|
||||||
for (keyr, lsnr) in &ranges {
|
|
||||||
|
for Layer {
|
||||||
|
key_range: keyr,
|
||||||
|
lsn_range: lsnr,
|
||||||
|
..
|
||||||
|
} in &files
|
||||||
|
{
|
||||||
keys.push(keyr.start);
|
keys.push(keyr.start);
|
||||||
keys.push(keyr.end);
|
keys.push(keyr.end);
|
||||||
lsns.push(lsnr.start);
|
lsns.push(lsnr.start);
|
||||||
lsns.push(lsnr.end);
|
lsns.push(lsnr.end);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lsns.extend(lines.iter().map(|(lsn, _)| *lsn));
|
||||||
|
|
||||||
// Analyze
|
// Analyze
|
||||||
let key_map = build_coordinate_compression_map(keys);
|
let key_map = build_coordinate_compression_map(keys);
|
||||||
let lsn_map = build_coordinate_compression_map(lsns);
|
let lsn_map = build_coordinate_compression_map(lsns);
|
||||||
@@ -103,11 +197,19 @@ pub fn main() -> Result<()> {
|
|||||||
println!(
|
println!(
|
||||||
"{}",
|
"{}",
|
||||||
BeginSvg {
|
BeginSvg {
|
||||||
w: key_map.len() as f32,
|
w: (key_map.len() + 10) as f32,
|
||||||
h: stretch * lsn_map.len() as f32
|
h: stretch * lsn_map.len() as f32
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
for (keyr, lsnr) in &ranges {
|
|
||||||
|
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
|
||||||
|
|
||||||
|
for Layer {
|
||||||
|
filename,
|
||||||
|
key_range: keyr,
|
||||||
|
lsn_range: lsnr,
|
||||||
|
} in &files
|
||||||
|
{
|
||||||
let key_start = *key_map.get(&keyr.start).unwrap();
|
let key_start = *key_map.get(&keyr.start).unwrap();
|
||||||
let key_end = *key_map.get(&keyr.end).unwrap();
|
let key_end = *key_map.get(&keyr.end).unwrap();
|
||||||
let key_diff = key_end - key_start;
|
let key_diff = key_end - key_start;
|
||||||
@@ -123,7 +225,6 @@ pub fn main() -> Result<()> {
|
|||||||
let mut lsn_diff = (lsn_end - lsn_start) as f32;
|
let mut lsn_diff = (lsn_end - lsn_start) as f32;
|
||||||
let mut fill = Fill::None;
|
let mut fill = Fill::None;
|
||||||
let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
|
let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
|
||||||
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
|
|
||||||
let mut lsn_offset = 0.0;
|
let mut lsn_offset = 0.0;
|
||||||
|
|
||||||
// Fill in and thicken rectangle if it's an
|
// Fill in and thicken rectangle if it's an
|
||||||
@@ -143,7 +244,7 @@ pub fn main() -> Result<()> {
|
|||||||
println!(
|
println!(
|
||||||
" {}",
|
" {}",
|
||||||
rectangle(
|
rectangle(
|
||||||
key_start as f32 + stretch * xmargin,
|
5.0 + key_start as f32 + stretch * xmargin,
|
||||||
stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
|
stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
|
||||||
key_diff as f32 - stretch * 2.0 * xmargin,
|
key_diff as f32 - stretch * 2.0 * xmargin,
|
||||||
stretch * (lsn_diff - 2.0 * ymargin)
|
stretch * (lsn_diff - 2.0 * ymargin)
|
||||||
@@ -151,8 +252,29 @@ pub fn main() -> Result<()> {
|
|||||||
.fill(fill)
|
.fill(fill)
|
||||||
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
|
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
|
||||||
.border_radius(0.4)
|
.border_radius(0.4)
|
||||||
|
.comment(filename)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (lsn, kind) in lines {
|
||||||
|
let lsn_start = *lsn_map.get(&lsn).unwrap();
|
||||||
|
let lsn_end = lsn_start;
|
||||||
|
let stretch = 2.0;
|
||||||
|
let lsn_diff = 0.3;
|
||||||
|
let lsn_offset = -lsn_diff / 2.0;
|
||||||
|
let ymargin = 0.05;
|
||||||
|
println!(
|
||||||
|
"{}",
|
||||||
|
rectangle(
|
||||||
|
0.0f32 + stretch * xmargin,
|
||||||
|
stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
|
||||||
|
(key_map.len() + 10) as f32,
|
||||||
|
stretch * (lsn_diff - 2.0 * ymargin)
|
||||||
|
)
|
||||||
|
.fill(kind)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
println!("{}", EndSvg);
|
println!("{}", EndSvg);
|
||||||
|
|
||||||
eprintln!("num_images: {}", num_images);
|
eprintln!("num_images: {}", num_images);
|
||||||
|
|||||||
@@ -2,8 +2,8 @@ use std::collections::HashMap;
|
|||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
|
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||||
use pageserver::tenant::storage_layer::LayerFileName;
|
use pageserver::tenant::storage_layer::LayerName;
|
||||||
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
|
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
|
|||||||
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
|
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
|
||||||
#[derive(serde::Serialize)]
|
#[derive(serde::Serialize)]
|
||||||
struct Output<'a> {
|
struct Output<'a> {
|
||||||
layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
|
layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
|
||||||
disk_consistent_lsn: Lsn,
|
disk_consistent_lsn: Lsn,
|
||||||
timeline_metadata: &'a TimelineMetadata,
|
timeline_metadata: &'a TimelineMetadata,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -100,7 +100,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
|||||||
|
|
||||||
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH
|
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH
|
||||||
async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
|
async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
|
||||||
let file = VirtualFile::open(path).await?;
|
let file = VirtualFile::open(path, ctx).await?;
|
||||||
let file_id = page_cache::next_file_id();
|
let file_id = page_cache::next_file_id();
|
||||||
let block_reader = FileBlockReader::new(&file, file_id);
|
let block_reader = FileBlockReader::new(&file, file_id);
|
||||||
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
|
|||||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||||
page_cache::init(100);
|
page_cache::init(100);
|
||||||
let file = VirtualFile::open(path).await?;
|
let file = VirtualFile::open(path, ctx).await?;
|
||||||
let file_id = page_cache::next_file_id();
|
let file_id = page_cache::next_file_id();
|
||||||
let block_reader = FileBlockReader::new(&file, file_id);
|
let block_reader = FileBlockReader::new(&file, file_id);
|
||||||
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
let summary_blk = block_reader.read_blk(0, ctx).await?;
|
||||||
|
|||||||
@@ -219,6 +219,7 @@ fn handle_metadata(
|
|||||||
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
||||||
println!("Current metadata:\n{meta:?}");
|
println!("Current metadata:\n{meta:?}");
|
||||||
let mut update_meta = false;
|
let mut update_meta = false;
|
||||||
|
// TODO: simplify this part
|
||||||
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
|
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
|
||||||
meta = TimelineMetadata::new(
|
meta = TimelineMetadata::new(
|
||||||
*disk_consistent_lsn,
|
*disk_consistent_lsn,
|
||||||
|
|||||||
98
pageserver/pagebench/src/cmd/aux_files.rs
Normal file
98
pageserver/pagebench/src/cmd/aux_files.rs
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
|
||||||
|
use pageserver_api::shard::TenantShardId;
|
||||||
|
use utils::id::TenantTimelineId;
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
/// Ingest aux files into the pageserver.
|
||||||
|
#[derive(clap::Parser)]
|
||||||
|
pub(crate) struct Args {
|
||||||
|
#[clap(long, default_value = "http://localhost:9898")]
|
||||||
|
mgmt_api_endpoint: String,
|
||||||
|
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||||
|
page_service_connstring: String,
|
||||||
|
#[clap(long)]
|
||||||
|
pageserver_jwt: Option<String>,
|
||||||
|
|
||||||
|
targets: Option<Vec<TenantTimelineId>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||||
|
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||||
|
.enable_all()
|
||||||
|
.build()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let main_task = rt.spawn(main_impl(args));
|
||||||
|
rt.block_on(main_task).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||||
|
let args: &'static Args = Box::leak(Box::new(args));
|
||||||
|
|
||||||
|
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||||
|
args.mgmt_api_endpoint.clone(),
|
||||||
|
args.pageserver_jwt.as_deref(),
|
||||||
|
));
|
||||||
|
|
||||||
|
// discover targets
|
||||||
|
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||||
|
&mgmt_api_client,
|
||||||
|
crate::util::cli::targets::Spec {
|
||||||
|
limit_to_first_n_targets: None,
|
||||||
|
targets: {
|
||||||
|
if let Some(targets) = &args.targets {
|
||||||
|
if targets.len() != 1 {
|
||||||
|
anyhow::bail!("must specify exactly one target");
|
||||||
|
}
|
||||||
|
Some(targets.clone())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let timeline = timelines[0];
|
||||||
|
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||||
|
let timeline_id = timeline.timeline_id;
|
||||||
|
|
||||||
|
println!("operating on timeline {}", timeline);
|
||||||
|
|
||||||
|
mgmt_api_client
|
||||||
|
.tenant_config(&TenantConfigRequest {
|
||||||
|
tenant_id: timeline.tenant_id,
|
||||||
|
config: TenantConfig {
|
||||||
|
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
})
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
for batch in 0..100 {
|
||||||
|
let items = (0..100)
|
||||||
|
.map(|id| {
|
||||||
|
(
|
||||||
|
format!("pg_logical/mappings/{:03}.{:03}", batch, id),
|
||||||
|
format!("{:08}", id),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect::<HashMap<_, _>>();
|
||||||
|
let file_cnt = items.len();
|
||||||
|
mgmt_api_client
|
||||||
|
.ingest_aux_files(tenant_shard_id, timeline_id, items)
|
||||||
|
.await?;
|
||||||
|
println!("ingested {file_cnt} files");
|
||||||
|
}
|
||||||
|
|
||||||
|
let files = mgmt_api_client
|
||||||
|
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
println!("{} files found", files.len());
|
||||||
|
|
||||||
|
anyhow::Ok(())
|
||||||
|
}
|
||||||
@@ -312,8 +312,12 @@ async fn main_impl(
|
|||||||
let (rel_tag, block_no) =
|
let (rel_tag, block_no) =
|
||||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||||
PagestreamGetPageRequest {
|
PagestreamGetPageRequest {
|
||||||
latest: rng.gen_bool(args.req_latest_probability),
|
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||||
lsn: r.timeline_lsn,
|
Lsn::MAX
|
||||||
|
} else {
|
||||||
|
r.timeline_lsn
|
||||||
|
},
|
||||||
|
not_modified_since: r.timeline_lsn,
|
||||||
rel: rel_tag,
|
rel: rel_tag,
|
||||||
blkno: block_no,
|
blkno: block_no,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,9 +2,11 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
|
|||||||
|
|
||||||
use pageserver_client::mgmt_api;
|
use pageserver_client::mgmt_api;
|
||||||
use rand::seq::SliceRandom;
|
use rand::seq::SliceRandom;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{debug, info};
|
use tracing::{debug, info};
|
||||||
use utils::id::{TenantTimelineId, TimelineId};
|
use utils::id::{TenantTimelineId, TimelineId};
|
||||||
|
|
||||||
|
use std::{f64, sync::Arc};
|
||||||
use tokio::{
|
use tokio::{
|
||||||
sync::{mpsc, OwnedSemaphorePermit},
|
sync::{mpsc, OwnedSemaphorePermit},
|
||||||
task::JoinSet,
|
task::JoinSet,
|
||||||
@@ -12,10 +14,7 @@ use tokio::{
|
|||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
num::NonZeroUsize,
|
num::NonZeroUsize,
|
||||||
sync::{
|
sync::atomic::{AtomicU64, Ordering},
|
||||||
atomic::{AtomicU64, Ordering},
|
|
||||||
Arc,
|
|
||||||
},
|
|
||||||
time::{Duration, Instant},
|
time::{Duration, Instant},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -51,19 +50,31 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(serde::Serialize)]
|
||||||
|
struct Output {
|
||||||
|
downloads_count: u64,
|
||||||
|
downloads_bytes: u64,
|
||||||
|
evictions_count: u64,
|
||||||
|
timeline_restarts: u64,
|
||||||
|
#[serde(with = "humantime_serde")]
|
||||||
|
runtime: Duration,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Default)]
|
#[derive(Debug, Default)]
|
||||||
struct LiveStats {
|
struct LiveStats {
|
||||||
evictions: AtomicU64,
|
evictions_count: AtomicU64,
|
||||||
downloads: AtomicU64,
|
downloads_count: AtomicU64,
|
||||||
|
downloads_bytes: AtomicU64,
|
||||||
timeline_restarts: AtomicU64,
|
timeline_restarts: AtomicU64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LiveStats {
|
impl LiveStats {
|
||||||
fn eviction_done(&self) {
|
fn eviction_done(&self) {
|
||||||
self.evictions.fetch_add(1, Ordering::Relaxed);
|
self.evictions_count.fetch_add(1, Ordering::Relaxed);
|
||||||
}
|
}
|
||||||
fn download_done(&self) {
|
fn download_done(&self, size: u64) {
|
||||||
self.downloads.fetch_add(1, Ordering::Relaxed);
|
self.downloads_count.fetch_add(1, Ordering::Relaxed);
|
||||||
|
self.downloads_bytes.fetch_add(size, Ordering::Relaxed);
|
||||||
}
|
}
|
||||||
fn timeline_restart_done(&self) {
|
fn timeline_restart_done(&self) {
|
||||||
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
|
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
|
||||||
@@ -92,28 +103,49 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
|||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
let token = CancellationToken::new();
|
||||||
let mut tasks = JoinSet::new();
|
let mut tasks = JoinSet::new();
|
||||||
|
|
||||||
let live_stats = Arc::new(LiveStats::default());
|
let periodic_stats = Arc::new(LiveStats::default());
|
||||||
|
let total_stats = Arc::new(LiveStats::default());
|
||||||
|
|
||||||
|
let start = Instant::now();
|
||||||
tasks.spawn({
|
tasks.spawn({
|
||||||
let live_stats = Arc::clone(&live_stats);
|
let periodic_stats = Arc::clone(&periodic_stats);
|
||||||
|
let total_stats = Arc::clone(&total_stats);
|
||||||
|
let cloned_token = token.clone();
|
||||||
async move {
|
async move {
|
||||||
let mut last_at = Instant::now();
|
let mut last_at = Instant::now();
|
||||||
loop {
|
loop {
|
||||||
|
if cloned_token.is_cancelled() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
|
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
let delta: Duration = now - last_at;
|
let delta: Duration = now - last_at;
|
||||||
last_at = now;
|
last_at = now;
|
||||||
|
|
||||||
let LiveStats {
|
let LiveStats {
|
||||||
evictions,
|
evictions_count,
|
||||||
downloads,
|
downloads_count,
|
||||||
|
downloads_bytes,
|
||||||
timeline_restarts,
|
timeline_restarts,
|
||||||
} = &*live_stats;
|
} = &*periodic_stats;
|
||||||
let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
let evictions_count = evictions_count.swap(0, Ordering::Relaxed);
|
||||||
let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
let downloads_count = downloads_count.swap(0, Ordering::Relaxed);
|
||||||
|
let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed);
|
||||||
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
|
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
|
||||||
info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
|
|
||||||
|
total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed);
|
||||||
|
total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed);
|
||||||
|
total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed);
|
||||||
|
total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed);
|
||||||
|
|
||||||
|
let evictions_per_s = evictions_count as f64 / delta.as_secs_f64();
|
||||||
|
let downloads_per_s = downloads_count as f64 / delta.as_secs_f64();
|
||||||
|
let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64);
|
||||||
|
|
||||||
|
info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -124,14 +156,42 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
|||||||
args,
|
args,
|
||||||
Arc::clone(&mgmt_api_client),
|
Arc::clone(&mgmt_api_client),
|
||||||
tl,
|
tl,
|
||||||
Arc::clone(&live_stats),
|
Arc::clone(&periodic_stats),
|
||||||
|
token.clone(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if let Some(runtime) = args.runtime {
|
||||||
|
tokio::spawn(async move {
|
||||||
|
tokio::time::sleep(runtime.into()).await;
|
||||||
|
token.cancel();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
while let Some(res) = tasks.join_next().await {
|
while let Some(res) = tasks.join_next().await {
|
||||||
res.unwrap();
|
res.unwrap();
|
||||||
}
|
}
|
||||||
|
let end = Instant::now();
|
||||||
|
let duration: Duration = end - start;
|
||||||
|
|
||||||
|
let output = {
|
||||||
|
let LiveStats {
|
||||||
|
evictions_count,
|
||||||
|
downloads_count,
|
||||||
|
downloads_bytes,
|
||||||
|
timeline_restarts,
|
||||||
|
} = &*total_stats;
|
||||||
|
Output {
|
||||||
|
downloads_count: downloads_count.load(Ordering::Relaxed),
|
||||||
|
downloads_bytes: downloads_bytes.load(Ordering::Relaxed),
|
||||||
|
evictions_count: evictions_count.load(Ordering::Relaxed),
|
||||||
|
timeline_restarts: timeline_restarts.load(Ordering::Relaxed),
|
||||||
|
runtime: duration,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let output = serde_json::to_string_pretty(&output).unwrap();
|
||||||
|
println!("{output}");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -140,6 +200,7 @@ async fn timeline_actor(
|
|||||||
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
|
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
|
||||||
timeline: TenantTimelineId,
|
timeline: TenantTimelineId,
|
||||||
live_stats: Arc<LiveStats>,
|
live_stats: Arc<LiveStats>,
|
||||||
|
token: CancellationToken,
|
||||||
) {
|
) {
|
||||||
// TODO: support sharding
|
// TODO: support sharding
|
||||||
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||||
@@ -149,7 +210,7 @@ async fn timeline_actor(
|
|||||||
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
|
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
|
||||||
concurrency: Arc<tokio::sync::Semaphore>,
|
concurrency: Arc<tokio::sync::Semaphore>,
|
||||||
}
|
}
|
||||||
loop {
|
while !token.is_cancelled() {
|
||||||
debug!("restarting timeline");
|
debug!("restarting timeline");
|
||||||
let layer_map_info = mgmt_api_client
|
let layer_map_info = mgmt_api_client
|
||||||
.layer_map_info(tenant_shard_id, timeline.timeline_id)
|
.layer_map_info(tenant_shard_id, timeline.timeline_id)
|
||||||
@@ -185,7 +246,7 @@ async fn timeline_actor(
|
|||||||
|
|
||||||
live_stats.timeline_restart_done();
|
live_stats.timeline_restart_done();
|
||||||
|
|
||||||
loop {
|
while !token.is_cancelled() {
|
||||||
assert!(!timeline.joinset.is_empty());
|
assert!(!timeline.joinset.is_empty());
|
||||||
if let Some(res) = timeline.joinset.try_join_next() {
|
if let Some(res) = timeline.joinset.try_join_next() {
|
||||||
debug!(?res, "a layer actor exited, should not happen");
|
debug!(?res, "a layer actor exited, should not happen");
|
||||||
@@ -255,7 +316,7 @@ async fn layer_actor(
|
|||||||
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
|
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
live_stats.download_done();
|
live_stats.download_done(layer.layer_file_size());
|
||||||
did_it
|
did_it
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ mod util {
|
|||||||
|
|
||||||
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
|
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
|
||||||
mod cmd {
|
mod cmd {
|
||||||
|
pub(super) mod aux_files;
|
||||||
pub(super) mod basebackup;
|
pub(super) mod basebackup;
|
||||||
pub(super) mod getpage_latest_lsn;
|
pub(super) mod getpage_latest_lsn;
|
||||||
pub(super) mod ondemand_download_churn;
|
pub(super) mod ondemand_download_churn;
|
||||||
@@ -27,6 +28,7 @@ enum Args {
|
|||||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||||
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
||||||
|
AuxFiles(cmd::aux_files::Args),
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
@@ -46,6 +48,7 @@ fn main() {
|
|||||||
cmd::trigger_initial_size_calculation::main(args)
|
cmd::trigger_initial_size_calculation::main(args)
|
||||||
}
|
}
|
||||||
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||||
|
Args::AuxFiles(args) => cmd::aux_files::main(args),
|
||||||
}
|
}
|
||||||
.unwrap()
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|||||||
285
pageserver/src/aux_file.rs
Normal file
285
pageserver/src/aux_file.rs
Normal file
@@ -0,0 +1,285 @@
|
|||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use ::metrics::IntGauge;
|
||||||
|
use bytes::{Buf, BufMut, Bytes};
|
||||||
|
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
|
||||||
|
use tracing::warn;
|
||||||
|
|
||||||
|
// BEGIN Copyright (c) 2017 Servo Contributors
|
||||||
|
|
||||||
|
/// 128-bit FNV hash usable in `const` contexts.
///
/// Starting from `INITIAL_STATE`, every input byte is XOR-ed into the state, which is then
/// multiplied (wrapping) by `PRIME`. An empty input yields `INITIAL_STATE` unchanged.
#[inline]
#[must_use]
pub const fn fnv_hash(bytes: &[u8]) -> u128 {
    const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d;
    const PRIME: u128 = 0x0000000001000000000000000000013B;

    let mut state = INITIAL_STATE;
    let mut pending = bytes;
    // Iterators are not available in `const fn`, so peel one byte off the front per round.
    while let [byte, tail @ ..] = pending {
        state = (state ^ (*byte as u128)).wrapping_mul(PRIME);
        pending = tail;
    }
    state
}
|
||||||
|
|
||||||
|
// END Copyright (c) 2017 Servo Contributors
|
||||||
|
|
||||||
|
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash].
|
||||||
|
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
|
||||||
|
let mut key: [u8; 16] = [0; METADATA_KEY_SIZE];
|
||||||
|
let hash = fnv_hash(data).to_be_bytes();
|
||||||
|
key[0] = AUX_KEY_PREFIX;
|
||||||
|
key[1] = dir_level1;
|
||||||
|
key[2] = dir_level2;
|
||||||
|
key[3..16].copy_from_slice(&hash[3..16]);
|
||||||
|
Key::from_metadata_key_fixed_size(&key)
|
||||||
|
}
|
||||||
|
|
||||||
|
const AUX_DIR_PG_LOGICAL: u8 = 0x01;
|
||||||
|
const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
|
||||||
|
const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
|
||||||
|
|
||||||
|
/// Encode the aux file into a fixed-size key.
|
||||||
|
///
|
||||||
|
/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
|
||||||
|
/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path
|
||||||
|
/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix
|
||||||
|
/// is roughly based on the first two components of the path, one unique number for one component.
|
||||||
|
///
|
||||||
|
/// * pg_logical/mappings -> 0x0101
|
||||||
|
/// * pg_logical/snapshots -> 0x0102
|
||||||
|
/// * pg_logical/replorigin_checkpoint -> 0x0103
|
||||||
|
/// * pg_logical/others -> 0x01FF
|
||||||
|
/// * pg_replslot/ -> 0x0201
|
||||||
|
/// * others -> 0xFFFF
|
||||||
|
///
|
||||||
|
/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
|
||||||
|
/// The new file type must have never been written to the storage before. Otherwise, there could be data
|
||||||
|
/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix.
|
||||||
|
pub fn encode_aux_file_key(path: &str) -> Key {
|
||||||
|
if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
|
||||||
|
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
|
||||||
|
} else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
|
||||||
|
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
|
||||||
|
} else if path == "pg_logical/replorigin_checkpoint" {
|
||||||
|
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
|
||||||
|
} else if let Some(fname) = path.strip_prefix("pg_logical/") {
|
||||||
|
if cfg!(debug_assertions) {
|
||||||
|
warn!(
|
||||||
|
"unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
|
||||||
|
path
|
||||||
|
);
|
||||||
|
}
|
||||||
|
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
|
||||||
|
} else if let Some(fname) = path.strip_prefix("pg_replslot/") {
|
||||||
|
aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
|
||||||
|
} else {
|
||||||
|
if cfg!(debug_assertions) {
|
||||||
|
warn!(
|
||||||
|
"unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
|
||||||
|
path
|
||||||
|
);
|
||||||
|
}
|
||||||
|
aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const AUX_FILE_ENCODING_VERSION: u8 = 0x01;
|
||||||
|
|
||||||
|
pub fn decode_file_value(val: &[u8]) -> anyhow::Result<Vec<(&str, &[u8])>> {
|
||||||
|
let mut ptr = val;
|
||||||
|
if ptr.is_empty() {
|
||||||
|
// empty value = no files
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
assert_eq!(
|
||||||
|
ptr.get_u8(),
|
||||||
|
AUX_FILE_ENCODING_VERSION,
|
||||||
|
"unsupported aux file value"
|
||||||
|
);
|
||||||
|
let mut files = vec![];
|
||||||
|
while ptr.has_remaining() {
|
||||||
|
let key_len = ptr.get_u32() as usize;
|
||||||
|
let key = &ptr[..key_len];
|
||||||
|
ptr.advance(key_len);
|
||||||
|
let val_len = ptr.get_u32() as usize;
|
||||||
|
let content = &ptr[..val_len];
|
||||||
|
ptr.advance(val_len);
|
||||||
|
|
||||||
|
let path = std::str::from_utf8(key)?;
|
||||||
|
files.push((path, content));
|
||||||
|
}
|
||||||
|
Ok(files)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference
|
||||||
|
/// to the original value slice. Be cautious about memory consumption.
|
||||||
|
pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result<Vec<(String, Bytes)>> {
|
||||||
|
let mut ptr = val.clone();
|
||||||
|
if ptr.is_empty() {
|
||||||
|
// empty value = no files
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
assert_eq!(
|
||||||
|
ptr.get_u8(),
|
||||||
|
AUX_FILE_ENCODING_VERSION,
|
||||||
|
"unsupported aux file value"
|
||||||
|
);
|
||||||
|
let mut files = vec![];
|
||||||
|
while ptr.has_remaining() {
|
||||||
|
let key_len = ptr.get_u32() as usize;
|
||||||
|
let key = ptr.slice(..key_len);
|
||||||
|
ptr.advance(key_len);
|
||||||
|
let val_len = ptr.get_u32() as usize;
|
||||||
|
let content = ptr.slice(..val_len);
|
||||||
|
ptr.advance(val_len);
|
||||||
|
|
||||||
|
let path = std::str::from_utf8(&key)?.to_string();
|
||||||
|
files.push((path, content));
|
||||||
|
}
|
||||||
|
Ok(files)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
|
||||||
|
if files.is_empty() {
|
||||||
|
// no files = empty value
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
let mut encoded = vec![];
|
||||||
|
encoded.put_u8(AUX_FILE_ENCODING_VERSION);
|
||||||
|
for (path, content) in files {
|
||||||
|
if path.len() > u32::MAX as usize {
|
||||||
|
anyhow::bail!("{} exceeds path size limit", path);
|
||||||
|
}
|
||||||
|
encoded.put_u32(path.len() as u32);
|
||||||
|
encoded.put_slice(path.as_bytes());
|
||||||
|
if content.len() > u32::MAX as usize {
|
||||||
|
anyhow::bail!("{} exceeds content size limit", path);
|
||||||
|
}
|
||||||
|
encoded.put_u32(content.len() as u32);
|
||||||
|
encoded.put_slice(content);
|
||||||
|
}
|
||||||
|
Ok(encoded)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An estimation of the size of aux files.
|
||||||
|
pub struct AuxFileSizeEstimator {
|
||||||
|
aux_file_size_gauge: IntGauge,
|
||||||
|
size: Arc<std::sync::Mutex<Option<isize>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AuxFileSizeEstimator {
|
||||||
|
pub fn new(aux_file_size_gauge: IntGauge) -> Self {
|
||||||
|
Self {
|
||||||
|
aux_file_size_gauge,
|
||||||
|
size: Arc::new(std::sync::Mutex::new(None)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn on_base_backup(&self, new_size: usize) {
|
||||||
|
let mut guard = self.size.lock().unwrap();
|
||||||
|
*guard = Some(new_size as isize);
|
||||||
|
self.report(new_size as isize);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn on_add(&self, file_size: usize) {
|
||||||
|
let mut guard = self.size.lock().unwrap();
|
||||||
|
if let Some(size) = &mut *guard {
|
||||||
|
*size += file_size as isize;
|
||||||
|
self.report(*size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn on_remove(&self, file_size: usize) {
|
||||||
|
let mut guard = self.size.lock().unwrap();
|
||||||
|
if let Some(size) = &mut *guard {
|
||||||
|
*size -= file_size as isize;
|
||||||
|
self.report(*size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn on_update(&self, old_size: usize, new_size: usize) {
|
||||||
|
let mut guard = self.size.lock().unwrap();
|
||||||
|
if let Some(size) = &mut *guard {
|
||||||
|
*size += new_size as isize - old_size as isize;
|
||||||
|
self.report(*size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn report(&self, size: isize) {
|
||||||
|
self.aux_file_size_gauge.set(size as i64);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode `files`, decode the result again and check that nothing changed.
    fn check_roundtrip(files: Vec<(&str, &[u8])>) {
        assert_eq!(
            files,
            decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
        );
    }

    #[test]
    fn test_hash_portable() {
        // The AUX file encoding relies on the hash being identical on every platform, so pin the
        // output of the algorithm for a few known inputs.
        let expectations: [(u128, &str); 3] = [
            (265160408618497461376862998434862070044, "test1"),
            (295486155126299629456360817749600553988, "test/test2"),
            (144066263297769815596495629667062367629, ""),
        ];
        for &(expected, input) in expectations.iter() {
            assert_eq!(expected, super::fnv_hash(input.as_bytes()));
        }
    }

    #[test]
    fn test_encoding_portable() {
        // To correctly retrieve AUX files, the key generated for a given file must be the same
        // in every version of the page server.
        let expectations: [(&str, &str); 6] = [
            (
                "62000001017F8B83D94F7081693471ABF91C",
                "pg_logical/mappings/test1",
            ),
            (
                "62000001027F8E83D94F7081693471ABFCCD",
                "pg_logical/snapshots/test2",
            ),
            (
                "62000001032E07BB014262B821756295C58D",
                "pg_logical/replorigin_checkpoint",
            ),
            (
                "62000001FF4F38E1C74754E7D03C1A660178",
                "pg_logical/unsupported",
            ),
            ("62000002017F8D83D94F7081693471ABFB92", "pg_replslot/test3"),
            (
                "620000FFFF2B6ECC8AEF93F643DC44F15E03",
                "other_file_not_supported",
            ),
        ];
        for &(expected, path) in expectations.iter() {
            assert_eq!(expected, encode_aux_file_key(path).to_string());
        }
    }

    #[test]
    fn test_value_encoding() {
        // Two files survive an encode/decode round trip ...
        check_roundtrip(vec![
            ("pg_logical/1.file", "1111".as_bytes()),
            ("pg_logical/2.file", "2222".as_bytes()),
        ]);
        // ... and so does the empty list.
        check_roundtrip(vec![]);
    }
}
|
||||||
@@ -10,7 +10,7 @@
|
|||||||
//! This module is responsible for creation of such tarball
|
//! This module is responsible for creation of such tarball
|
||||||
//! from data stored in object storage.
|
//! from data stored in object storage.
|
||||||
//!
|
//!
|
||||||
use anyhow::{anyhow, bail, ensure, Context};
|
use anyhow::{anyhow, Context};
|
||||||
use bytes::{BufMut, Bytes, BytesMut};
|
use bytes::{BufMut, Bytes, BytesMut};
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
use pageserver_api::key::{key_to_slru_block, Key};
|
use pageserver_api::key::{key_to_slru_block, Key};
|
||||||
@@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI;
|
|||||||
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
|
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
|
#[derive(Debug, thiserror::Error)]
|
||||||
|
pub enum BasebackupError {
|
||||||
|
#[error("basebackup pageserver error {0:#}")]
|
||||||
|
Server(#[from] anyhow::Error),
|
||||||
|
#[error("basebackup client error {0:#}")]
|
||||||
|
Client(#[source] io::Error),
|
||||||
|
}
|
||||||
|
|
||||||
/// Create basebackup with non-rel data in it.
|
/// Create basebackup with non-rel data in it.
|
||||||
/// Only include relational data if 'full_backup' is true.
|
/// Only include relational data if 'full_backup' is true.
|
||||||
///
|
///
|
||||||
@@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>(
|
|||||||
prev_lsn: Option<Lsn>,
|
prev_lsn: Option<Lsn>,
|
||||||
full_backup: bool,
|
full_backup: bool,
|
||||||
ctx: &'a RequestContext,
|
ctx: &'a RequestContext,
|
||||||
) -> anyhow::Result<()>
|
) -> Result<(), BasebackupError>
|
||||||
where
|
where
|
||||||
W: AsyncWrite + Send + Sync + Unpin,
|
W: AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
@@ -92,8 +100,10 @@ where
|
|||||||
|
|
||||||
// Consolidate the derived and the provided prev_lsn values
|
// Consolidate the derived and the provided prev_lsn values
|
||||||
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||||
if backup_prev != Lsn(0) {
|
if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
|
||||||
ensure!(backup_prev == provided_prev_lsn);
|
return Err(BasebackupError::Server(anyhow!(
|
||||||
|
"backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
|
||||||
|
)));
|
||||||
}
|
}
|
||||||
provided_prev_lsn
|
provided_prev_lsn
|
||||||
} else {
|
} else {
|
||||||
@@ -159,15 +169,26 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
|
async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
|
||||||
let (kind, segno, _) = key_to_slru_block(*key)?;
|
let (kind, segno, _) = key_to_slru_block(*key)?;
|
||||||
|
|
||||||
match kind {
|
match kind {
|
||||||
SlruKind::Clog => {
|
SlruKind::Clog => {
|
||||||
ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
|
if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
|
||||||
|
return Err(BasebackupError::Server(anyhow!(
|
||||||
|
"invalid SlruKind::Clog record: block.len()={}",
|
||||||
|
block.len()
|
||||||
|
)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
|
SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
|
||||||
ensure!(block.len() == BLCKSZ as usize);
|
if block.len() != BLCKSZ as usize {
|
||||||
|
return Err(BasebackupError::Server(anyhow!(
|
||||||
|
"invalid {:?} record: block.len()={}",
|
||||||
|
kind,
|
||||||
|
block.len()
|
||||||
|
)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -194,12 +215,15 @@ where
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn flush(&mut self) -> anyhow::Result<()> {
|
async fn flush(&mut self) -> Result<(), BasebackupError> {
|
||||||
let nblocks = self.buf.len() / BLCKSZ as usize;
|
let nblocks = self.buf.len() / BLCKSZ as usize;
|
||||||
let (kind, segno) = self.current_segment.take().unwrap();
|
let (kind, segno) = self.current_segment.take().unwrap();
|
||||||
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
|
let segname = format!("{}/{:>04X}", kind.to_str(), segno);
|
||||||
let header = new_tar_header(&segname, self.buf.len() as u64)?;
|
let header = new_tar_header(&segname, self.buf.len() as u64)?;
|
||||||
self.ar.append(&header, self.buf.as_slice()).await?;
|
self.ar
|
||||||
|
.append(&header, self.buf.as_slice())
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
|
|
||||||
self.total_blocks += nblocks;
|
self.total_blocks += nblocks;
|
||||||
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
|
||||||
@@ -209,7 +233,7 @@ where
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn finish(mut self) -> anyhow::Result<()> {
|
async fn finish(mut self) -> Result<(), BasebackupError> {
|
||||||
let res = if self.current_segment.is_none() || self.buf.is_empty() {
|
let res = if self.current_segment.is_none() || self.buf.is_empty() {
|
||||||
Ok(())
|
Ok(())
|
||||||
} else {
|
} else {
|
||||||
@@ -226,7 +250,7 @@ impl<'a, W> Basebackup<'a, W>
|
|||||||
where
|
where
|
||||||
W: AsyncWrite + Send + Sync + Unpin,
|
W: AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
async fn send_tarball(mut self) -> anyhow::Result<()> {
|
async fn send_tarball(mut self) -> Result<(), BasebackupError> {
|
||||||
// TODO include checksum
|
// TODO include checksum
|
||||||
|
|
||||||
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
|
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
|
||||||
@@ -262,16 +286,25 @@ where
|
|||||||
let slru_partitions = self
|
let slru_partitions = self
|
||||||
.timeline
|
.timeline
|
||||||
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
|
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
|
||||||
.await?
|
.await
|
||||||
.partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
|
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||||
|
.partition(
|
||||||
|
self.timeline.get_shard_identity(),
|
||||||
|
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||||
|
);
|
||||||
|
|
||||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||||
|
|
||||||
for part in slru_partitions.parts {
|
for part in slru_partitions.parts {
|
||||||
let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
|
let blocks = self
|
||||||
|
.timeline
|
||||||
|
.get_vectored(part, self.lsn, self.ctx)
|
||||||
|
.await
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||||
|
|
||||||
for (key, block) in blocks {
|
for (key, block) in blocks {
|
||||||
slru_builder.add_block(&key, block?).await?;
|
let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||||
|
slru_builder.add_block(&key, block).await?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
slru_builder.finish().await?;
|
slru_builder.finish().await?;
|
||||||
@@ -279,8 +312,11 @@ where
|
|||||||
|
|
||||||
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
||||||
// Create tablespace directories
|
// Create tablespace directories
|
||||||
for ((spcnode, dbnode), has_relmap_file) in
|
for ((spcnode, dbnode), has_relmap_file) in self
|
||||||
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
|
.timeline
|
||||||
|
.list_dbdirs(self.lsn, self.ctx)
|
||||||
|
.await
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||||
{
|
{
|
||||||
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
||||||
|
|
||||||
@@ -289,7 +325,8 @@ where
|
|||||||
let rels = self
|
let rels = self
|
||||||
.timeline
|
.timeline
|
||||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||||
for &rel in rels.iter() {
|
for &rel in rels.iter() {
|
||||||
// Send init fork as main fork to provide well formed empty
|
// Send init fork as main fork to provide well formed empty
|
||||||
// contents of UNLOGGED relations. Postgres copies it in
|
// contents of UNLOGGED relations. Postgres copies it in
|
||||||
@@ -312,7 +349,12 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
|
for (path, content) in self
|
||||||
|
.timeline
|
||||||
|
.list_aux_files(self.lsn, self.ctx)
|
||||||
|
.await
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||||
|
{
|
||||||
if path.starts_with("pg_replslot") {
|
if path.starts_with("pg_replslot") {
|
||||||
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
|
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
|
||||||
let restart_lsn = Lsn(u64::from_le_bytes(
|
let restart_lsn = Lsn(u64::from_le_bytes(
|
||||||
@@ -343,34 +385,41 @@ where
|
|||||||
for xid in self
|
for xid in self
|
||||||
.timeline
|
.timeline
|
||||||
.list_twophase_files(self.lsn, self.ctx)
|
.list_twophase_files(self.lsn, self.ctx)
|
||||||
.await?
|
.await
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||||
{
|
{
|
||||||
self.add_twophase_file(xid).await?;
|
self.add_twophase_file(xid).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
fail_point!("basebackup-before-control-file", |_| {
|
fail_point!("basebackup-before-control-file", |_| {
|
||||||
bail!("failpoint basebackup-before-control-file")
|
Err(BasebackupError::Server(anyhow!(
|
||||||
|
"failpoint basebackup-before-control-file"
|
||||||
|
)))
|
||||||
});
|
});
|
||||||
|
|
||||||
// Generate pg_control and bootstrap WAL segment.
|
// Generate pg_control and bootstrap WAL segment.
|
||||||
self.add_pgcontrol_file().await?;
|
self.add_pgcontrol_file().await?;
|
||||||
self.ar.finish().await?;
|
self.ar.finish().await.map_err(BasebackupError::Client)?;
|
||||||
debug!("all tarred up!");
|
debug!("all tarred up!");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add contents of relfilenode `src`, naming it as `dst`.
|
/// Add contents of relfilenode `src`, naming it as `dst`.
|
||||||
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
|
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
|
||||||
let nblocks = self
|
let nblocks = self
|
||||||
.timeline
|
.timeline
|
||||||
.get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
|
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||||
|
|
||||||
// If the relation is empty, create an empty file
|
// If the relation is empty, create an empty file
|
||||||
if nblocks == 0 {
|
if nblocks == 0 {
|
||||||
let file_name = dst.to_segfile_name(0);
|
let file_name = dst.to_segfile_name(0);
|
||||||
let header = new_tar_header(&file_name, 0)?;
|
let header = new_tar_header(&file_name, 0)?;
|
||||||
self.ar.append(&header, &mut io::empty()).await?;
|
self.ar
|
||||||
|
.append(&header, &mut io::empty())
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -384,14 +433,18 @@ where
|
|||||||
for blknum in startblk..endblk {
|
for blknum in startblk..endblk {
|
||||||
let img = self
|
let img = self
|
||||||
.timeline
|
.timeline
|
||||||
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
|
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||||
segment_data.extend_from_slice(&img[..]);
|
segment_data.extend_from_slice(&img[..]);
|
||||||
}
|
}
|
||||||
|
|
||||||
let file_name = dst.to_segfile_name(seg as u32);
|
let file_name = dst.to_segfile_name(seg as u32);
|
||||||
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
|
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
|
||||||
self.ar.append(&header, segment_data.as_slice()).await?;
|
self.ar
|
||||||
|
.append(&header, segment_data.as_slice())
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
|
|
||||||
seg += 1;
|
seg += 1;
|
||||||
startblk = endblk;
|
startblk = endblk;
|
||||||
@@ -411,20 +464,22 @@ where
|
|||||||
spcnode: u32,
|
spcnode: u32,
|
||||||
dbnode: u32,
|
dbnode: u32,
|
||||||
has_relmap_file: bool,
|
has_relmap_file: bool,
|
||||||
) -> anyhow::Result<()> {
|
) -> Result<(), BasebackupError> {
|
||||||
let relmap_img = if has_relmap_file {
|
let relmap_img = if has_relmap_file {
|
||||||
let img = self
|
let img = self
|
||||||
.timeline
|
.timeline
|
||||||
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||||
|
|
||||||
ensure!(
|
if img.len()
|
||||||
img.len()
|
!= dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
|
||||||
== dispatch_pgversion!(
|
{
|
||||||
self.timeline.pg_version,
|
return Err(BasebackupError::Server(anyhow!(
|
||||||
pgv::bindings::SIZEOF_RELMAPFILE
|
"img.len() != SIZE_OF_RELMAPFILE, img.len()={}",
|
||||||
)
|
img.len(),
|
||||||
);
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
Some(img)
|
Some(img)
|
||||||
} else {
|
} else {
|
||||||
@@ -437,14 +492,20 @@ where
|
|||||||
ver => format!("{ver}\x0A"),
|
ver => format!("{ver}\x0A"),
|
||||||
};
|
};
|
||||||
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
|
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
|
||||||
self.ar.append(&header, pg_version_str.as_bytes()).await?;
|
self.ar
|
||||||
|
.append(&header, pg_version_str.as_bytes())
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
|
|
||||||
info!("timeline.pg_version {}", self.timeline.pg_version);
|
info!("timeline.pg_version {}", self.timeline.pg_version);
|
||||||
|
|
||||||
if let Some(img) = relmap_img {
|
if let Some(img) = relmap_img {
|
||||||
// filenode map for global tablespace
|
// filenode map for global tablespace
|
||||||
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
|
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
|
||||||
self.ar.append(&header, &img[..]).await?;
|
self.ar
|
||||||
|
.append(&header, &img[..])
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
} else {
|
} else {
|
||||||
warn!("global/pg_filenode.map is missing");
|
warn!("global/pg_filenode.map is missing");
|
||||||
}
|
}
|
||||||
@@ -463,18 +524,26 @@ where
|
|||||||
&& self
|
&& self
|
||||||
.timeline
|
.timeline
|
||||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||||
.await?
|
.await
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||||
.is_empty()
|
.is_empty()
|
||||||
{
|
{
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
// User defined tablespaces are not supported
|
// User defined tablespaces are not supported
|
||||||
ensure!(spcnode == DEFAULTTABLESPACE_OID);
|
if spcnode != DEFAULTTABLESPACE_OID {
|
||||||
|
return Err(BasebackupError::Server(anyhow!(
|
||||||
|
"spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
// Append dir path for each database
|
// Append dir path for each database
|
||||||
let path = format!("base/{}", dbnode);
|
let path = format!("base/{}", dbnode);
|
||||||
let header = new_tar_header_dir(&path)?;
|
let header = new_tar_header_dir(&path)?;
|
||||||
self.ar.append(&header, &mut io::empty()).await?;
|
self.ar
|
||||||
|
.append(&header, &mut io::empty())
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
|
|
||||||
if let Some(img) = relmap_img {
|
if let Some(img) = relmap_img {
|
||||||
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
||||||
@@ -484,11 +553,17 @@ where
|
|||||||
ver => format!("{ver}\x0A"),
|
ver => format!("{ver}\x0A"),
|
||||||
};
|
};
|
||||||
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
|
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
|
||||||
self.ar.append(&header, pg_version_str.as_bytes()).await?;
|
self.ar
|
||||||
|
.append(&header, pg_version_str.as_bytes())
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
|
|
||||||
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
|
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
|
||||||
let header = new_tar_header(&relmap_path, img.len() as u64)?;
|
let header = new_tar_header(&relmap_path, img.len() as u64)?;
|
||||||
self.ar.append(&header, &img[..]).await?;
|
self.ar
|
||||||
|
.append(&header, &img[..])
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -497,11 +572,12 @@ where
|
|||||||
//
|
//
|
||||||
// Extract twophase state files
|
// Extract twophase state files
|
||||||
//
|
//
|
||||||
async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
|
||||||
let img = self
|
let img = self
|
||||||
.timeline
|
.timeline
|
||||||
.get_twophase_file(xid, self.lsn, self.ctx)
|
.get_twophase_file(xid, self.lsn, self.ctx)
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||||
|
|
||||||
let mut buf = BytesMut::new();
|
let mut buf = BytesMut::new();
|
||||||
buf.extend_from_slice(&img[..]);
|
buf.extend_from_slice(&img[..]);
|
||||||
@@ -509,7 +585,10 @@ where
|
|||||||
buf.put_u32_le(crc);
|
buf.put_u32_le(crc);
|
||||||
let path = format!("pg_twophase/{:>08X}", xid);
|
let path = format!("pg_twophase/{:>08X}", xid);
|
||||||
let header = new_tar_header(&path, buf.len() as u64)?;
|
let header = new_tar_header(&path, buf.len() as u64)?;
|
||||||
self.ar.append(&header, &buf[..]).await?;
|
self.ar
|
||||||
|
.append(&header, &buf[..])
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -518,24 +597,28 @@ where
|
|||||||
// Add generated pg_control file and bootstrap WAL segment.
|
// Add generated pg_control file and bootstrap WAL segment.
|
||||||
// Also send zenith.signal file with extra bootstrap data.
|
// Also send zenith.signal file with extra bootstrap data.
|
||||||
//
|
//
|
||||||
async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
|
||||||
// add zenith.signal file
|
// add zenith.signal file
|
||||||
let mut zenith_signal = String::new();
|
let mut zenith_signal = String::new();
|
||||||
if self.prev_record_lsn == Lsn(0) {
|
if self.prev_record_lsn == Lsn(0) {
|
||||||
if self.lsn == self.timeline.get_ancestor_lsn() {
|
if self.timeline.is_ancestor_lsn(self.lsn) {
|
||||||
write!(zenith_signal, "PREV LSN: none")?;
|
write!(zenith_signal, "PREV LSN: none")
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||||
} else {
|
} else {
|
||||||
write!(zenith_signal, "PREV LSN: invalid")?;
|
write!(zenith_signal, "PREV LSN: invalid")
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
|
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
|
||||||
|
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||||
}
|
}
|
||||||
self.ar
|
self.ar
|
||||||
.append(
|
.append(
|
||||||
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
|
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
|
||||||
zenith_signal.as_bytes(),
|
zenith_signal.as_bytes(),
|
||||||
)
|
)
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
|
|
||||||
let checkpoint_bytes = self
|
let checkpoint_bytes = self
|
||||||
.timeline
|
.timeline
|
||||||
@@ -557,7 +640,10 @@ where
|
|||||||
|
|
||||||
//send pg_control
|
//send pg_control
|
||||||
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
||||||
self.ar.append(&header, &pg_control_bytes[..]).await?;
|
self.ar
|
||||||
|
.append(&header, &pg_control_bytes[..])
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
|
|
||||||
//send wal segment
|
//send wal segment
|
||||||
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
|
let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
|
||||||
@@ -572,8 +658,16 @@ where
|
|||||||
self.lsn,
|
self.lsn,
|
||||||
)
|
)
|
||||||
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
||||||
ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
|
if wal_seg.len() != WAL_SEGMENT_SIZE {
|
||||||
self.ar.append(&header, &wal_seg[..]).await?;
|
return Err(BasebackupError::Server(anyhow!(
|
||||||
|
"wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}",
|
||||||
|
wal_seg.len()
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
self.ar
|
||||||
|
.append(&header, &wal_seg[..])
|
||||||
|
.await
|
||||||
|
.map_err(BasebackupError::Client)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
//! Main entry point for the Page Server executable.
|
//! Main entry point for the Page Server executable.
|
||||||
|
|
||||||
use std::env::{var, VarError};
|
use std::env::{var, VarError};
|
||||||
|
use std::io::Read;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use std::{env, ops::ControlFlow, str::FromStr};
|
use std::{env, ops::ControlFlow, str::FromStr};
|
||||||
@@ -121,8 +122,10 @@ fn main() -> anyhow::Result<()> {
|
|||||||
&[("node_id", &conf.id.to_string())],
|
&[("node_id", &conf.id.to_string())],
|
||||||
);
|
);
|
||||||
|
|
||||||
// after setting up logging, log the effective IO engine choice
|
// after setting up logging, log the effective IO engine choice and read path implementations
|
||||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||||
|
info!(?conf.get_impl, "starting with get page implementation");
|
||||||
|
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
|
||||||
|
|
||||||
let tenants_path = conf.tenants_path();
|
let tenants_path = conf.tenants_path();
|
||||||
if !tenants_path.exists() {
|
if !tenants_path.exists() {
|
||||||
@@ -149,37 +152,34 @@ fn initialize_config(
|
|||||||
workdir: &Utf8Path,
|
workdir: &Utf8Path,
|
||||||
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
|
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
|
||||||
let init = arg_matches.get_flag("init");
|
let init = arg_matches.get_flag("init");
|
||||||
let update_config = init || arg_matches.get_flag("update-config");
|
|
||||||
|
|
||||||
let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
|
let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
|
||||||
if init {
|
Ok(mut f) => {
|
||||||
anyhow::bail!(
|
if init {
|
||||||
"Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it",
|
anyhow::bail!("config file already exists: {cfg_file_path}");
|
||||||
);
|
}
|
||||||
|
let md = f.metadata().context("stat config file")?;
|
||||||
|
if md.is_file() {
|
||||||
|
let mut s = String::new();
|
||||||
|
f.read_to_string(&mut s).context("read config file")?;
|
||||||
|
Some(s.parse().context("parse config file toml")?)
|
||||||
|
} else {
|
||||||
|
anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
|
||||||
|
Err(e) => {
|
||||||
|
anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
|
||||||
}
|
}
|
||||||
// Supplement the CLI arguments with the config file
|
|
||||||
let cfg_file_contents = std::fs::read_to_string(cfg_file_path)
|
|
||||||
.with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?;
|
|
||||||
(
|
|
||||||
cfg_file_contents
|
|
||||||
.parse::<toml_edit::Document>()
|
|
||||||
.with_context(|| {
|
|
||||||
format!("Failed to parse '{cfg_file_path}' as pageserver config")
|
|
||||||
})?,
|
|
||||||
true,
|
|
||||||
)
|
|
||||||
} else if cfg_file_path.exists() {
|
|
||||||
anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file");
|
|
||||||
} else {
|
|
||||||
// We're initializing the tenant, so there's no config file yet
|
|
||||||
(
|
|
||||||
DEFAULT_CONFIG_FILE
|
|
||||||
.parse::<toml_edit::Document>()
|
|
||||||
.context("could not parse built-in config file")?,
|
|
||||||
false,
|
|
||||||
)
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let mut effective_config = file_contents.unwrap_or_else(|| {
|
||||||
|
DEFAULT_CONFIG_FILE
|
||||||
|
.parse()
|
||||||
|
.expect("unit tests ensure this works")
|
||||||
|
});
|
||||||
|
|
||||||
|
// Patch with overrides from the command line
|
||||||
if let Some(values) = arg_matches.get_many::<String>("config-override") {
|
if let Some(values) = arg_matches.get_many::<String>("config-override") {
|
||||||
for option_line in values {
|
for option_line in values {
|
||||||
let doc = toml_edit::Document::from_str(option_line).with_context(|| {
|
let doc = toml_edit::Document::from_str(option_line).with_context(|| {
|
||||||
@@ -187,22 +187,21 @@ fn initialize_config(
|
|||||||
})?;
|
})?;
|
||||||
|
|
||||||
for (key, item) in doc.iter() {
|
for (key, item) in doc.iter() {
|
||||||
if config_file_exists && update_config && key == "id" && toml.contains_key(key) {
|
effective_config.insert(key, item.clone());
|
||||||
anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden");
|
|
||||||
}
|
|
||||||
toml.insert(key, item.clone());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("Resulting toml: {toml}");
|
debug!("Resulting toml: {effective_config}");
|
||||||
let conf = PageServerConf::parse_and_validate(&toml, workdir)
|
|
||||||
|
// Construct the runtime representation
|
||||||
|
let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
|
||||||
.context("Failed to parse pageserver configuration")?;
|
.context("Failed to parse pageserver configuration")?;
|
||||||
|
|
||||||
if update_config {
|
if init {
|
||||||
info!("Writing pageserver config to '{cfg_file_path}'");
|
info!("Writing pageserver config to '{cfg_file_path}'");
|
||||||
|
|
||||||
std::fs::write(cfg_file_path, toml.to_string())
|
std::fs::write(cfg_file_path, effective_config.to_string())
|
||||||
.with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
|
.with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
|
||||||
info!("Config successfully written to '{cfg_file_path}'")
|
info!("Config successfully written to '{cfg_file_path}'")
|
||||||
}
|
}
|
||||||
@@ -285,7 +284,6 @@ fn start_pageserver(
|
|||||||
))
|
))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
pageserver::preinitialize_metrics();
|
pageserver::preinitialize_metrics();
|
||||||
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
|
|
||||||
|
|
||||||
// If any failpoints were set from FAILPOINTS environment variable,
|
// If any failpoints were set from FAILPOINTS environment variable,
|
||||||
// print them to the log for debugging purposes
|
// print them to the log for debugging purposes
|
||||||
@@ -517,16 +515,12 @@ fn start_pageserver(
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let secondary_controller = if let Some(remote_storage) = &remote_storage {
|
let secondary_controller = secondary::spawn_tasks(
|
||||||
secondary::spawn_tasks(
|
tenant_manager.clone(),
|
||||||
tenant_manager.clone(),
|
remote_storage.clone(),
|
||||||
remote_storage.clone(),
|
background_jobs_barrier.clone(),
|
||||||
background_jobs_barrier.clone(),
|
shutdown_pageserver.clone(),
|
||||||
shutdown_pageserver.clone(),
|
);
|
||||||
)
|
|
||||||
} else {
|
|
||||||
secondary::null_controller()
|
|
||||||
};
|
|
||||||
|
|
||||||
// shared state between the disk-usage backed eviction background task and the http endpoint
|
// shared state between the disk-usage backed eviction background task and the http endpoint
|
||||||
// that allows triggering disk-usage based eviction manually. note that the http endpoint
|
// that allows triggering disk-usage based eviction manually. note that the http endpoint
|
||||||
@@ -534,15 +528,13 @@ fn start_pageserver(
|
|||||||
// been configured.
|
// been configured.
|
||||||
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
|
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
|
||||||
|
|
||||||
if let Some(remote_storage) = &remote_storage {
|
launch_disk_usage_global_eviction_task(
|
||||||
launch_disk_usage_global_eviction_task(
|
conf,
|
||||||
conf,
|
remote_storage.clone(),
|
||||||
remote_storage.clone(),
|
disk_usage_eviction_state.clone(),
|
||||||
disk_usage_eviction_state.clone(),
|
tenant_manager.clone(),
|
||||||
tenant_manager.clone(),
|
background_jobs_barrier.clone(),
|
||||||
background_jobs_barrier.clone(),
|
)?;
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Start up the service to handle HTTP mgmt API request. We created the
|
// Start up the service to handle HTTP mgmt API request. We created the
|
||||||
// listener earlier already.
|
// listener earlier already.
|
||||||
@@ -655,17 +647,20 @@ fn start_pageserver(
|
|||||||
None,
|
None,
|
||||||
"libpq endpoint listener",
|
"libpq endpoint listener",
|
||||||
true,
|
true,
|
||||||
async move {
|
{
|
||||||
page_service::libpq_listener_main(
|
let tenant_manager = tenant_manager.clone();
|
||||||
conf,
|
async move {
|
||||||
broker_client,
|
page_service::libpq_listener_main(
|
||||||
pg_auth,
|
tenant_manager,
|
||||||
pageserver_listener,
|
broker_client,
|
||||||
conf.pg_auth_type,
|
pg_auth,
|
||||||
libpq_ctx,
|
pageserver_listener,
|
||||||
task_mgr::shutdown_token(),
|
conf.pg_auth_type,
|
||||||
)
|
libpq_ctx,
|
||||||
.await
|
task_mgr::shutdown_token(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -694,14 +689,7 @@ fn start_pageserver(
|
|||||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
||||||
// The plan is to change that over time.
|
// The plan is to change that over time.
|
||||||
shutdown_pageserver.take();
|
shutdown_pageserver.take();
|
||||||
let bg_remote_storage = remote_storage.clone();
|
pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
|
||||||
let bg_deletion_queue = deletion_queue.clone();
|
|
||||||
pageserver::shutdown_pageserver(
|
|
||||||
&tenant_manager,
|
|
||||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
|
||||||
0,
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
unreachable!()
|
unreachable!()
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -709,12 +697,11 @@ fn start_pageserver(
|
|||||||
|
|
||||||
fn create_remote_storage_client(
|
fn create_remote_storage_client(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
) -> anyhow::Result<Option<GenericRemoteStorage>> {
|
) -> anyhow::Result<GenericRemoteStorage> {
|
||||||
let config = if let Some(config) = &conf.remote_storage_config {
|
let config = if let Some(config) = &conf.remote_storage_config {
|
||||||
config
|
config
|
||||||
} else {
|
} else {
|
||||||
tracing::warn!("no remote storage configured, this is a deprecated configuration");
|
anyhow::bail!("no remote storage configured, this is a deprecated configuration");
|
||||||
return Ok(None);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Create the client
|
// Create the client
|
||||||
@@ -734,7 +721,7 @@ fn create_remote_storage_client(
|
|||||||
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
|
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Some(remote_storage))
|
Ok(remote_storage)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cli() -> Command {
|
fn cli() -> Command {
|
||||||
@@ -756,18 +743,13 @@ fn cli() -> Command {
|
|||||||
// See `settings.md` for more details on the extra configuration patameters pageserver can process
|
// See `settings.md` for more details on the extra configuration patameters pageserver can process
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new("config-override")
|
Arg::new("config-override")
|
||||||
|
.long("config-override")
|
||||||
.short('c')
|
.short('c')
|
||||||
.num_args(1)
|
.num_args(1)
|
||||||
.action(ArgAction::Append)
|
.action(ArgAction::Append)
|
||||||
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
|
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
|
||||||
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
|
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
|
||||||
)
|
)
|
||||||
.arg(
|
|
||||||
Arg::new("update-config")
|
|
||||||
.long("update-config")
|
|
||||||
.action(ArgAction::SetTrue)
|
|
||||||
.help("Update the config file when started"),
|
|
||||||
)
|
|
||||||
.arg(
|
.arg(
|
||||||
Arg::new("enabled-features")
|
Arg::new("enabled-features")
|
||||||
.long("enabled-features")
|
.long("enabled-features")
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ use pageserver_api::shard::TenantShardId;
|
|||||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||||
use serde;
|
use serde;
|
||||||
use serde::de::IntoDeserializer;
|
use serde::de::IntoDeserializer;
|
||||||
use std::{collections::HashMap, env};
|
use std::env;
|
||||||
use storage_broker::Uri;
|
use storage_broker::Uri;
|
||||||
use utils::crashsafe::path_with_suffix_extension;
|
use utils::crashsafe::path_with_suffix_extension;
|
||||||
use utils::id::ConnectionId;
|
use utils::id::ConnectionId;
|
||||||
@@ -30,9 +30,9 @@ use utils::{
|
|||||||
logging::LogFormat,
|
logging::LogFormat,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::tenant::config::TenantConfOpt;
|
|
||||||
use crate::tenant::timeline::GetVectoredImpl;
|
use crate::tenant::timeline::GetVectoredImpl;
|
||||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||||
|
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
|
||||||
use crate::tenant::{
|
use crate::tenant::{
|
||||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||||
};
|
};
|
||||||
@@ -51,7 +51,7 @@ pub mod defaults {
|
|||||||
use crate::tenant::config::defaults::*;
|
use crate::tenant::config::defaults::*;
|
||||||
use const_format::formatcp;
|
use const_format::formatcp;
|
||||||
|
|
||||||
pub use pageserver_api::{
|
pub use pageserver_api::config::{
|
||||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||||
DEFAULT_PG_LISTEN_PORT,
|
DEFAULT_PG_LISTEN_PORT,
|
||||||
};
|
};
|
||||||
@@ -91,13 +91,15 @@ pub mod defaults {
|
|||||||
|
|
||||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||||
|
|
||||||
|
pub const DEFAULT_GET_IMPL: &str = "legacy";
|
||||||
|
|
||||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||||
|
|
||||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||||
|
|
||||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||||
|
|
||||||
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
|
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Default built-in configuration file.
|
/// Default built-in configuration file.
|
||||||
@@ -138,6 +140,8 @@ pub mod defaults {
|
|||||||
|
|
||||||
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
|
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
|
||||||
|
|
||||||
|
#get_impl = '{DEFAULT_GET_IMPL}'
|
||||||
|
|
||||||
#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
|
#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
|
||||||
|
|
||||||
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
|
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
|
||||||
@@ -284,6 +288,8 @@ pub struct PageServerConf {
|
|||||||
|
|
||||||
pub get_vectored_impl: GetVectoredImpl,
|
pub get_vectored_impl: GetVectoredImpl,
|
||||||
|
|
||||||
|
pub get_impl: GetImpl,
|
||||||
|
|
||||||
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||||
|
|
||||||
pub validate_vectored_get: bool,
|
pub validate_vectored_get: bool,
|
||||||
@@ -329,26 +335,6 @@ impl<T: Clone> BuilderValue<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
|
||||||
// as a separate structure. This information is not neeed by the pageserver
|
|
||||||
// itself, it is only used for registering the pageserver with the control
|
|
||||||
// plane and/or storage controller.
|
|
||||||
//
|
|
||||||
#[derive(serde::Deserialize)]
|
|
||||||
pub(crate) struct NodeMetadata {
|
|
||||||
#[serde(rename = "host")]
|
|
||||||
pub(crate) postgres_host: String,
|
|
||||||
#[serde(rename = "port")]
|
|
||||||
pub(crate) postgres_port: u16,
|
|
||||||
pub(crate) http_host: String,
|
|
||||||
pub(crate) http_port: u16,
|
|
||||||
|
|
||||||
// Deployment tools may write fields to the metadata file beyond what we
|
|
||||||
// use in this type: this type intentionally only names fields that require.
|
|
||||||
#[serde(flatten)]
|
|
||||||
pub(crate) other: HashMap<String, serde_json::Value>,
|
|
||||||
}
|
|
||||||
|
|
||||||
// needed to simplify config construction
|
// needed to simplify config construction
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct PageServerConfigBuilder {
|
struct PageServerConfigBuilder {
|
||||||
@@ -414,6 +400,8 @@ struct PageServerConfigBuilder {
|
|||||||
|
|
||||||
get_vectored_impl: BuilderValue<GetVectoredImpl>,
|
get_vectored_impl: BuilderValue<GetVectoredImpl>,
|
||||||
|
|
||||||
|
get_impl: BuilderValue<GetImpl>,
|
||||||
|
|
||||||
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
|
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
|
||||||
|
|
||||||
validate_vectored_get: BuilderValue<bool>,
|
validate_vectored_get: BuilderValue<bool>,
|
||||||
@@ -503,6 +491,7 @@ impl PageServerConfigBuilder {
|
|||||||
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
|
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
|
||||||
|
|
||||||
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
|
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
|
||||||
|
get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
|
||||||
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
|
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
|
||||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||||
)),
|
)),
|
||||||
@@ -681,6 +670,10 @@ impl PageServerConfigBuilder {
|
|||||||
self.get_vectored_impl = BuilderValue::Set(value);
|
self.get_vectored_impl = BuilderValue::Set(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_impl(&mut self, value: GetImpl) {
|
||||||
|
self.get_impl = BuilderValue::Set(value);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
|
pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
|
||||||
self.max_vectored_read_bytes = BuilderValue::Set(value);
|
self.max_vectored_read_bytes = BuilderValue::Set(value);
|
||||||
}
|
}
|
||||||
@@ -750,6 +743,7 @@ impl PageServerConfigBuilder {
|
|||||||
secondary_download_concurrency,
|
secondary_download_concurrency,
|
||||||
ingest_batch_size,
|
ingest_batch_size,
|
||||||
get_vectored_impl,
|
get_vectored_impl,
|
||||||
|
get_impl,
|
||||||
max_vectored_read_bytes,
|
max_vectored_read_bytes,
|
||||||
validate_vectored_get,
|
validate_vectored_get,
|
||||||
ephemeral_bytes_per_memory_kb,
|
ephemeral_bytes_per_memory_kb,
|
||||||
@@ -1035,6 +1029,9 @@ impl PageServerConf {
|
|||||||
"get_vectored_impl" => {
|
"get_vectored_impl" => {
|
||||||
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
|
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
|
||||||
}
|
}
|
||||||
|
"get_impl" => {
|
||||||
|
builder.get_impl(parse_toml_from_str("get_impl", item)?)
|
||||||
|
}
|
||||||
"max_vectored_read_bytes" => {
|
"max_vectored_read_bytes" => {
|
||||||
let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
|
let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
|
||||||
builder.get_max_vectored_read_bytes(
|
builder.get_max_vectored_read_bytes(
|
||||||
@@ -1126,6 +1123,7 @@ impl PageServerConf {
|
|||||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||||
|
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
|
||||||
max_vectored_read_bytes: MaxVectoredReadBytes(
|
max_vectored_read_bytes: MaxVectoredReadBytes(
|
||||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||||
.expect("Invalid default constant"),
|
.expect("Invalid default constant"),
|
||||||
@@ -1365,6 +1363,7 @@ background_task_maximum_delay = '334 s'
|
|||||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||||
|
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
|
||||||
max_vectored_read_bytes: MaxVectoredReadBytes(
|
max_vectored_read_bytes: MaxVectoredReadBytes(
|
||||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||||
.expect("Invalid default constant")
|
.expect("Invalid default constant")
|
||||||
@@ -1438,6 +1437,7 @@ background_task_maximum_delay = '334 s'
|
|||||||
ingest_batch_size: 100,
|
ingest_batch_size: 100,
|
||||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||||
|
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
|
||||||
max_vectored_read_bytes: MaxVectoredReadBytes(
|
max_vectored_read_bytes: MaxVectoredReadBytes(
|
||||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||||
.expect("Invalid default constant")
|
.expect("Invalid default constant")
|
||||||
@@ -1557,6 +1557,7 @@ broker_endpoint = '{broker_endpoint}'
|
|||||||
endpoint: Some(endpoint.clone()),
|
endpoint: Some(endpoint.clone()),
|
||||||
concurrency_limit: s3_concurrency_limit,
|
concurrency_limit: s3_concurrency_limit,
|
||||||
max_keys_per_list_response: None,
|
max_keys_per_list_response: None,
|
||||||
|
upload_storage_class: None,
|
||||||
}),
|
}),
|
||||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -14,10 +14,8 @@ use tokio_util::sync::CancellationToken;
|
|||||||
use url::Url;
|
use url::Url;
|
||||||
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
|
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
|
||||||
|
|
||||||
use crate::{
|
use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
|
||||||
config::{NodeMetadata, PageServerConf},
|
use pageserver_api::config::NodeMetadata;
|
||||||
virtual_file::on_fatal_io_error,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||||
@@ -65,7 +63,7 @@ impl ControlPlaneClient {
|
|||||||
let mut client = reqwest::ClientBuilder::new();
|
let mut client = reqwest::ClientBuilder::new();
|
||||||
|
|
||||||
if let Some(jwt) = &conf.control_plane_api_token {
|
if let Some(jwt) = &conf.control_plane_api_token {
|
||||||
let mut headers = hyper::HeaderMap::new();
|
let mut headers = reqwest::header::HeaderMap::new();
|
||||||
headers.insert(
|
headers.insert(
|
||||||
"Authorization",
|
"Authorization",
|
||||||
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
|
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ use deleter::DeleterMessage;
|
|||||||
use list_writer::ListWriterQueueMessage;
|
use list_writer::ListWriterQueueMessage;
|
||||||
use validator::ValidatorQueueMessage;
|
use validator::ValidatorQueueMessage;
|
||||||
|
|
||||||
use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
|
use crate::{config::PageServerConf, tenant::storage_layer::LayerName};
|
||||||
|
|
||||||
// TODO: configurable for how long to wait before executing deletions
|
// TODO: configurable for how long to wait before executing deletions
|
||||||
|
|
||||||
@@ -479,7 +479,7 @@ impl DeletionQueueClient {
|
|||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
current_generation: Generation,
|
current_generation: Generation,
|
||||||
layers: Vec<(LayerFileName, LayerFileMetadata)>,
|
layers: Vec<(LayerName, LayerFileMetadata)>,
|
||||||
) -> Result<(), DeletionQueueError> {
|
) -> Result<(), DeletionQueueError> {
|
||||||
if current_generation.is_none() {
|
if current_generation.is_none() {
|
||||||
debug!("Enqueuing deletions in legacy mode, skipping queue");
|
debug!("Enqueuing deletions in legacy mode, skipping queue");
|
||||||
@@ -511,7 +511,7 @@ impl DeletionQueueClient {
|
|||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
current_generation: Generation,
|
current_generation: Generation,
|
||||||
layers: Vec<(LayerFileName, LayerFileMetadata)>,
|
layers: Vec<(LayerName, LayerFileMetadata)>,
|
||||||
) -> Result<(), DeletionQueueError> {
|
) -> Result<(), DeletionQueueError> {
|
||||||
metrics::DELETION_QUEUE
|
metrics::DELETION_QUEUE
|
||||||
.keys_submitted
|
.keys_submitted
|
||||||
@@ -632,7 +632,7 @@ impl DeletionQueue {
|
|||||||
///
|
///
|
||||||
/// If remote_storage is None, then the returned workers will also be None.
|
/// If remote_storage is None, then the returned workers will also be None.
|
||||||
pub fn new<C>(
|
pub fn new<C>(
|
||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: GenericRemoteStorage,
|
||||||
control_plane_client: Option<C>,
|
control_plane_client: Option<C>,
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
) -> (Self, Option<DeletionQueueWorkers<C>>)
|
) -> (Self, Option<DeletionQueueWorkers<C>>)
|
||||||
@@ -658,23 +658,6 @@ impl DeletionQueue {
|
|||||||
// longer to flush after Tenants have all been torn down.
|
// longer to flush after Tenants have all been torn down.
|
||||||
let cancel = CancellationToken::new();
|
let cancel = CancellationToken::new();
|
||||||
|
|
||||||
let remote_storage = match remote_storage {
|
|
||||||
None => {
|
|
||||||
return (
|
|
||||||
Self {
|
|
||||||
client: DeletionQueueClient {
|
|
||||||
tx,
|
|
||||||
executor_tx,
|
|
||||||
lsn_table: lsn_table.clone(),
|
|
||||||
},
|
|
||||||
cancel,
|
|
||||||
},
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
Some(r) => r,
|
|
||||||
};
|
|
||||||
|
|
||||||
(
|
(
|
||||||
Self {
|
Self {
|
||||||
client: DeletionQueueClient {
|
client: DeletionQueueClient {
|
||||||
@@ -734,20 +717,20 @@ mod test {
|
|||||||
use crate::{
|
use crate::{
|
||||||
control_plane_client::RetryForeverError,
|
control_plane_client::RetryForeverError,
|
||||||
repository::Key,
|
repository::Key,
|
||||||
tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
|
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
|
||||||
};
|
};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
pub const TIMELINE_ID: TimelineId =
|
pub const TIMELINE_ID: TimelineId =
|
||||||
TimelineId::from_array(hex!("11223344556677881122334455667788"));
|
TimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||||
|
|
||||||
pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName {
|
pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName {
|
||||||
key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
|
key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
|
||||||
lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
|
lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
|
||||||
});
|
});
|
||||||
|
|
||||||
// When you need a second layer in a test.
|
// When you need a second layer in a test.
|
||||||
pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName {
|
pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName {
|
||||||
key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
|
key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
|
||||||
lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
|
lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
|
||||||
});
|
});
|
||||||
@@ -765,7 +748,7 @@ mod test {
|
|||||||
/// Simulate a pageserver restart by destroying and recreating the deletion queue
|
/// Simulate a pageserver restart by destroying and recreating the deletion queue
|
||||||
async fn restart(&mut self) {
|
async fn restart(&mut self) {
|
||||||
let (deletion_queue, workers) = DeletionQueue::new(
|
let (deletion_queue, workers) = DeletionQueue::new(
|
||||||
Some(self.storage.clone()),
|
self.storage.clone(),
|
||||||
Some(self.mock_control_plane.clone()),
|
Some(self.mock_control_plane.clone()),
|
||||||
self.harness.conf,
|
self.harness.conf,
|
||||||
);
|
);
|
||||||
@@ -797,7 +780,7 @@ mod test {
|
|||||||
/// Returns remote layer file name, suitable for use in assert_remote_files
|
/// Returns remote layer file name, suitable for use in assert_remote_files
|
||||||
fn write_remote_layer(
|
fn write_remote_layer(
|
||||||
&self,
|
&self,
|
||||||
file_name: LayerFileName,
|
file_name: LayerName,
|
||||||
gen: Generation,
|
gen: Generation,
|
||||||
) -> anyhow::Result<String> {
|
) -> anyhow::Result<String> {
|
||||||
let tenant_shard_id = self.harness.tenant_shard_id;
|
let tenant_shard_id = self.harness.tenant_shard_id;
|
||||||
@@ -875,7 +858,7 @@ mod test {
|
|||||||
let mock_control_plane = MockControlPlane::new();
|
let mock_control_plane = MockControlPlane::new();
|
||||||
|
|
||||||
let (deletion_queue, worker) = DeletionQueue::new(
|
let (deletion_queue, worker) = DeletionQueue::new(
|
||||||
Some(storage.clone()),
|
storage.clone(),
|
||||||
Some(mock_control_plane.clone()),
|
Some(mock_control_plane.clone()),
|
||||||
harness.conf,
|
harness.conf,
|
||||||
);
|
);
|
||||||
@@ -952,7 +935,7 @@ mod test {
|
|||||||
let client = ctx.deletion_queue.new_client();
|
let client = ctx.deletion_queue.new_client();
|
||||||
client.recover(HashMap::new())?;
|
client.recover(HashMap::new())?;
|
||||||
|
|
||||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||||
let tenant_shard_id = ctx.harness.tenant_shard_id;
|
let tenant_shard_id = ctx.harness.tenant_shard_id;
|
||||||
|
|
||||||
let content: Vec<u8> = "victim1 contents".into();
|
let content: Vec<u8> = "victim1 contents".into();
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX;
|
|||||||
use crate::metrics;
|
use crate::metrics;
|
||||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||||
use crate::tenant::remote_timeline_client::LayerFileMetadata;
|
use crate::tenant::remote_timeline_client::LayerFileMetadata;
|
||||||
use crate::tenant::storage_layer::LayerFileName;
|
use crate::tenant::storage_layer::LayerName;
|
||||||
use crate::virtual_file::on_fatal_io_error;
|
use crate::virtual_file::on_fatal_io_error;
|
||||||
use crate::virtual_file::MaybeFatalIo;
|
use crate::virtual_file::MaybeFatalIo;
|
||||||
|
|
||||||
@@ -59,7 +59,7 @@ pub(super) struct DeletionOp {
|
|||||||
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
|
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
|
||||||
// have a config object handy to project it to a remote key, and need the consuming worker
|
// have a config object handy to project it to a remote key, and need the consuming worker
|
||||||
// to do it for you.
|
// to do it for you.
|
||||||
pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
|
pub(super) layers: Vec<(LayerName, LayerFileMetadata)>,
|
||||||
pub(super) objects: Vec<RemotePath>,
|
pub(super) objects: Vec<RemotePath>,
|
||||||
|
|
||||||
/// The _current_ generation of the Tenant shard attachment in which we are enqueuing
|
/// The _current_ generation of the Tenant shard attachment in which we are enqueuing
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ use crate::{
|
|||||||
mgr::TenantManager,
|
mgr::TenantManager,
|
||||||
remote_timeline_client::LayerFileMetadata,
|
remote_timeline_client::LayerFileMetadata,
|
||||||
secondary::SecondaryTenant,
|
secondary::SecondaryTenant,
|
||||||
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
|
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -534,13 +534,12 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
EvictionLayer::Secondary(layer) => {
|
EvictionLayer::Secondary(layer) => {
|
||||||
let file_size = layer.metadata.file_size();
|
let file_size = layer.metadata.file_size;
|
||||||
let tenant_manager = tenant_manager.clone();
|
|
||||||
|
|
||||||
js.spawn(async move {
|
js.spawn(async move {
|
||||||
layer
|
layer
|
||||||
.secondary_tenant
|
.secondary_tenant
|
||||||
.evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
|
.evict_layer(layer.timeline_id, layer.name)
|
||||||
.await;
|
.await;
|
||||||
Ok(file_size)
|
Ok(file_size)
|
||||||
});
|
});
|
||||||
@@ -599,7 +598,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
|||||||
pub(crate) struct EvictionSecondaryLayer {
|
pub(crate) struct EvictionSecondaryLayer {
|
||||||
pub(crate) secondary_tenant: Arc<SecondaryTenant>,
|
pub(crate) secondary_tenant: Arc<SecondaryTenant>,
|
||||||
pub(crate) timeline_id: TimelineId,
|
pub(crate) timeline_id: TimelineId,
|
||||||
pub(crate) name: LayerFileName,
|
pub(crate) name: LayerName,
|
||||||
pub(crate) metadata: LayerFileMetadata,
|
pub(crate) metadata: LayerFileMetadata,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -632,9 +631,9 @@ impl EvictionLayer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn get_name(&self) -> LayerFileName {
|
pub(crate) fn get_name(&self) -> LayerName {
|
||||||
match self {
|
match self {
|
||||||
Self::Attached(l) => l.layer_desc().filename(),
|
Self::Attached(l) => l.layer_desc().layer_name(),
|
||||||
Self::Secondary(sl) => sl.name.clone(),
|
Self::Secondary(sl) => sl.name.clone(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -642,7 +641,7 @@ impl EvictionLayer {
|
|||||||
pub(crate) fn get_file_size(&self) -> u64 {
|
pub(crate) fn get_file_size(&self) -> u64 {
|
||||||
match self {
|
match self {
|
||||||
Self::Attached(l) => l.layer_desc().file_size,
|
Self::Attached(l) => l.layer_desc().file_size,
|
||||||
Self::Secondary(sl) => sl.metadata.file_size(),
|
Self::Secondary(sl) => sl.metadata.file_size,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -257,6 +257,37 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/LsnByTimestampResponse"
|
$ref: "#/components/schemas/LsnByTimestampResponse"
|
||||||
|
|
||||||
|
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
|
||||||
|
parameters:
|
||||||
|
- name: tenant_shard_id
|
||||||
|
in: path
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
- name: timeline_id
|
||||||
|
in: path
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
format: hex
|
||||||
|
post:
|
||||||
|
description: Obtain lease for the given LSN
|
||||||
|
parameters:
|
||||||
|
- name: lsn
|
||||||
|
in: query
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
format: hex
|
||||||
|
description: A LSN to obtain the lease for
|
||||||
|
responses:
|
||||||
|
"200":
|
||||||
|
description: OK
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/LsnLease"
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
||||||
parameters:
|
parameters:
|
||||||
- name: tenant_id
|
- name: tenant_id
|
||||||
@@ -420,25 +451,6 @@ paths:
|
|||||||
description: Tenant scheduled to load successfully
|
description: Tenant scheduled to load successfully
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/synthetic_size:
|
/v1/tenant/{tenant_id}/synthetic_size:
|
||||||
parameters:
|
|
||||||
- name: tenant_id
|
|
||||||
in: path
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
get:
|
|
||||||
description: |
|
|
||||||
Calculate tenant's synthetic size
|
|
||||||
responses:
|
|
||||||
"200":
|
|
||||||
description: Tenant's synthetic size
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: "#/components/schemas/SyntheticSizeResponse"
|
|
||||||
|
|
||||||
# This route has no handler. TODO: remove?
|
|
||||||
/v1/tenant/{tenant_id}/size:
|
|
||||||
parameters:
|
parameters:
|
||||||
- name: tenant_id
|
- name: tenant_id
|
||||||
in: path
|
in: path
|
||||||
@@ -468,19 +480,9 @@ paths:
|
|||||||
content:
|
content:
|
||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
type: object
|
$ref: "#/components/schemas/SyntheticSizeResponse"
|
||||||
required:
|
text/html:
|
||||||
- id
|
description: SVG representation of the tenant and its timelines.
|
||||||
- size
|
|
||||||
properties:
|
|
||||||
id:
|
|
||||||
type: string
|
|
||||||
format: hex
|
|
||||||
size:
|
|
||||||
type: integer
|
|
||||||
nullable: true
|
|
||||||
description: |
|
|
||||||
Size metric in bytes or null if inputs_only=true was given.
|
|
||||||
"401":
|
"401":
|
||||||
description: Unauthorized Error
|
description: Unauthorized Error
|
||||||
content:
|
content:
|
||||||
@@ -610,6 +612,80 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/Error"
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
|
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
|
||||||
|
parameters:
|
||||||
|
- name: tenant_shard_id
|
||||||
|
in: path
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
- name: timeline_id
|
||||||
|
in: path
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
|
||||||
|
put:
|
||||||
|
description: |
|
||||||
|
Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`.
|
||||||
|
Current implementation might not be retryable across failure cases, but will be enhanced in future.
|
||||||
|
Detaching should be expected to be an expensive operation. Timeouts should be retried.
|
||||||
|
responses:
|
||||||
|
"200":
|
||||||
|
description: |
|
||||||
|
The timeline has been detached from its ancestor (now or earlier), and at least the returned timelines have been reparented.
|
||||||
|
If any timelines were deleted after reparenting, they might not be on this list.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/AncestorDetached"
|
||||||
|
|
||||||
|
"400":
|
||||||
|
description: |
|
||||||
|
Number of early checks meaning the timeline cannot be detached now:
|
||||||
|
- the ancestor of timeline has an ancestor: not supported, see RFC
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
|
"404":
|
||||||
|
description: Tenant or timeline not found.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/NotFoundError"
|
||||||
|
|
||||||
|
"409":
|
||||||
|
description: |
|
||||||
|
The timeline can never be detached:
|
||||||
|
- timeline has no ancestor, implying that the timeline has never had an ancestor
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/ConflictError"
|
||||||
|
|
||||||
|
"500":
|
||||||
|
description: |
|
||||||
|
Transient error, for example, pageserver shutdown happened while
|
||||||
|
processing the request but we were unable to distinguish that. Must
|
||||||
|
be retried.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
|
"503":
|
||||||
|
description: |
|
||||||
|
Temporarily unavailable, please retry. Possible reasons:
|
||||||
|
- another timeline detach for the same tenant is underway, please retry later
|
||||||
|
- detected shutdown error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||||
|
|
||||||
|
|
||||||
/v1/tenant/:
|
/v1/tenant/:
|
||||||
get:
|
get:
|
||||||
description: Get tenants list
|
description: Get tenants list
|
||||||
@@ -782,9 +858,6 @@ components:
|
|||||||
required:
|
required:
|
||||||
- mode
|
- mode
|
||||||
properties:
|
properties:
|
||||||
tenant_id:
|
|
||||||
type: string
|
|
||||||
description: Not used, scheduled for removal.
|
|
||||||
mode:
|
mode:
|
||||||
type: string
|
type: string
|
||||||
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
|
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
|
||||||
@@ -932,6 +1005,9 @@ components:
|
|||||||
format: hex
|
format: hex
|
||||||
size:
|
size:
|
||||||
type: integer
|
type: integer
|
||||||
|
nullable: true
|
||||||
|
description: |
|
||||||
|
Size metric in bytes or null if inputs_only=true was given.
|
||||||
segment_sizes:
|
segment_sizes:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
@@ -1009,6 +1085,15 @@ components:
|
|||||||
type: string
|
type: string
|
||||||
enum: [past, present, future, nodata]
|
enum: [past, present, future, nodata]
|
||||||
|
|
||||||
|
LsnLease:
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- valid_until
|
||||||
|
properties:
|
||||||
|
valid_until:
|
||||||
|
type: string
|
||||||
|
format: date-time
|
||||||
|
|
||||||
PageserverUtilization:
|
PageserverUtilization:
|
||||||
type: object
|
type: object
|
||||||
required:
|
required:
|
||||||
@@ -1066,6 +1151,19 @@ components:
|
|||||||
format: int64
|
format: int64
|
||||||
description: How many bytes of layer content were in the latest layer heatmap
|
description: How many bytes of layer content were in the latest layer heatmap
|
||||||
|
|
||||||
|
AncestorDetached:
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- reparented_timelines
|
||||||
|
properties:
|
||||||
|
reparented_timelines:
|
||||||
|
type: array
|
||||||
|
description: Set of reparented timeline ids
|
||||||
|
properties:
|
||||||
|
type: string
|
||||||
|
format: hex
|
||||||
|
description: TimelineId
|
||||||
|
|
||||||
|
|
||||||
Error:
|
Error:
|
||||||
type: object
|
type: object
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user