mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-17 13:10:38 +00:00
Compare commits
394 Commits
fcdm/image
...
release-pr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2804f5323b | ||
|
|
676adc6b32 | ||
|
|
3fd77eb0d4 | ||
|
|
3114be034a | ||
|
|
8dc7dc79dd | ||
|
|
fad9be4598 | ||
|
|
20d0939b00 | ||
|
|
ea0d35f3ca | ||
|
|
e34059cd18 | ||
|
|
d999c46692 | ||
|
|
82853cc1d1 | ||
|
|
1efaa16260 | ||
|
|
4dbb74b559 | ||
|
|
5ab10d051d | ||
|
|
f8bdce1015 | ||
|
|
7ba50708e3 | ||
|
|
e9e77ee744 | ||
|
|
ee93700a0f | ||
|
|
502b69b33b | ||
|
|
76ab57f33f | ||
|
|
5984edaecd | ||
|
|
3eb83a0ebb | ||
|
|
4d426f6fbe | ||
|
|
d04af08567 | ||
|
|
54586d6b57 | ||
|
|
e5384ebefc | ||
|
|
60a232400b | ||
|
|
edd809747b | ||
|
|
48957e23b7 | ||
|
|
1d5e476c96 | ||
|
|
2b11466b59 | ||
|
|
b6bd75964f | ||
|
|
fcb77f3d8f | ||
|
|
c3a40a06f3 | ||
|
|
1b1320a263 | ||
|
|
e1b4d96b5b | ||
|
|
a8ec18c0f4 | ||
|
|
045bc6af8b | ||
|
|
c8ac4c054e | ||
|
|
896d51367e | ||
|
|
a691786ce2 | ||
|
|
2991d01b61 | ||
|
|
e895644555 | ||
|
|
62d77e263f | ||
|
|
b2bbc20311 | ||
|
|
5accf6e24a | ||
|
|
0881d4f9e3 | ||
|
|
975786265c | ||
|
|
c4059939e6 | ||
|
|
75baf83fce | ||
|
|
459c2af8c1 | ||
|
|
51a43b121c | ||
|
|
256058f2ab | ||
|
|
ceedc3ef73 | ||
|
|
5273c94c59 | ||
|
|
dedf66ba5b | ||
|
|
8283779ee8 | ||
|
|
b8f9e3a9eb | ||
|
|
ec3efc56a8 | ||
|
|
94f6b488ed | ||
|
|
a12e4261a3 | ||
|
|
cd449d66ea | ||
|
|
6f8f7c7de9 | ||
|
|
12487e662d | ||
|
|
5bcae3a86e | ||
|
|
47657f2df4 | ||
|
|
d669dacd71 | ||
|
|
837988b6c9 | ||
|
|
9c6145f0a9 | ||
|
|
2424d90883 | ||
|
|
cf3baf6039 | ||
|
|
9c48b5c4ab | ||
|
|
c671aeacd4 | ||
|
|
bc7a82caf2 | ||
|
|
96a4e8de66 | ||
|
|
b5246753bf | ||
|
|
c1095f4c52 | ||
|
|
1718c0b59b | ||
|
|
8107ae8377 | ||
|
|
555ee9fdd0 | ||
|
|
6921577cec | ||
|
|
20fff05699 | ||
|
|
f2767d2056 | ||
|
|
76b92e3389 | ||
|
|
03f8a42ed9 | ||
|
|
60e5a56a5a | ||
|
|
afda4420bd | ||
|
|
ce1673a8c4 | ||
|
|
532b0fa52b | ||
|
|
4de2f0f3e0 | ||
|
|
41464325c7 | ||
|
|
7257ffbf75 | ||
|
|
84f027357d | ||
|
|
428d9fe69e | ||
|
|
e0af945f8f | ||
|
|
e7452d3756 | ||
|
|
5d6083bfc6 | ||
|
|
3882f57001 | ||
|
|
04190a1fea | ||
|
|
fcbe9fb184 | ||
|
|
cbb599f353 | ||
|
|
e49602ecf5 | ||
|
|
eb02f4619e | ||
|
|
9b8df2634f | ||
|
|
d152d4f16f | ||
|
|
b467d8067b | ||
|
|
a48b23d777 | ||
|
|
21a86487a2 | ||
|
|
686b3c79c8 | ||
|
|
02a8b7fbe0 | ||
|
|
feb359b459 | ||
|
|
0c105ef352 | ||
|
|
4f7704af24 | ||
|
|
e0c12faabd | ||
|
|
2f8a2681b8 | ||
|
|
7e4280955e | ||
|
|
349b375010 | ||
|
|
d0d4871682 | ||
|
|
01180666b0 | ||
|
|
587cb705b8 | ||
|
|
4d2bf55e6c | ||
|
|
5667372c61 | ||
|
|
61f99d703d | ||
|
|
24014d8383 | ||
|
|
e3ded64d1b | ||
|
|
9b714c8572 | ||
|
|
29fb675432 | ||
|
|
ca07fa5f8b | ||
|
|
6c94269c32 | ||
|
|
edc691647d | ||
|
|
855d7b4781 | ||
|
|
c49c9707ce | ||
|
|
2227540a0d | ||
|
|
f1347f2417 | ||
|
|
30b295b017 | ||
|
|
1cef395266 | ||
|
|
78d160f76d | ||
|
|
b9238059d6 | ||
|
|
d0cb4b88c8 | ||
|
|
1ec3e39d4e | ||
|
|
a1a74eef2c | ||
|
|
90e689adda | ||
|
|
f0b2d4b053 | ||
|
|
299d9474c9 | ||
|
|
7234208b36 | ||
|
|
93450f11f5 | ||
|
|
2f0f9edf33 | ||
|
|
d424f2b7c8 | ||
|
|
21315e80bc | ||
|
|
483b66d383 | ||
|
|
aa72a22661 | ||
|
|
5c0264b591 | ||
|
|
9f13277729 | ||
|
|
54aa319805 | ||
|
|
4a227484bf | ||
|
|
2f83f85291 | ||
|
|
d6cfcb0d93 | ||
|
|
392843ad2a | ||
|
|
bd4dae8f4a | ||
|
|
b05fe53cfd | ||
|
|
c13a2f0df1 | ||
|
|
39be366fc5 | ||
|
|
6eda0a3158 | ||
|
|
306c7a1813 | ||
|
|
80be423a58 | ||
|
|
5dcfef82f2 | ||
|
|
e67b8f69c0 | ||
|
|
e546872ab4 | ||
|
|
322ea1cf7c | ||
|
|
3633742de9 | ||
|
|
079d3a37ba | ||
|
|
a46e77b476 | ||
|
|
a92702b01e | ||
|
|
8ff3253f20 | ||
|
|
04b82c92a7 | ||
|
|
e5bf423e68 | ||
|
|
60af392e45 | ||
|
|
661fc41e71 | ||
|
|
702c488f32 | ||
|
|
45c5122754 | ||
|
|
558394f710 | ||
|
|
73b0898608 | ||
|
|
e65be4c2dc | ||
|
|
40087b8164 | ||
|
|
c762b59483 | ||
|
|
5d71601ca9 | ||
|
|
a113c3e433 | ||
|
|
e81fc598f4 | ||
|
|
48b845fa76 | ||
|
|
27096858dc | ||
|
|
4430d0ae7d | ||
|
|
6e183aa0de | ||
|
|
fd6d0b7635 | ||
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
@@ -39,7 +39,7 @@ runs:
|
||||
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
|
||||
if [ "${PR_NUMBER}" != "null" ]; then
|
||||
BRANCH_OR_PR=pr-${PR_NUMBER}
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
|
||||
# Shortcut for special branches
|
||||
BRANCH_OR_PR=${GITHUB_REF_NAME}
|
||||
else
|
||||
@@ -59,7 +59,7 @@ runs:
|
||||
BUCKET: neon-github-public-dev
|
||||
|
||||
# TODO: We can replace with a special docker image with Java and Allure pre-installed
|
||||
- uses: actions/setup-java@v3
|
||||
- uses: actions/setup-java@v4
|
||||
with:
|
||||
distribution: 'temurin'
|
||||
java-version: '17'
|
||||
@@ -76,8 +76,8 @@ runs:
|
||||
rm -f ${ALLURE_ZIP}
|
||||
fi
|
||||
env:
|
||||
ALLURE_VERSION: 2.24.0
|
||||
ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90
|
||||
ALLURE_VERSION: 2.27.0
|
||||
ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777
|
||||
|
||||
# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
|
||||
- name: Acquire lock
|
||||
@@ -180,7 +180,7 @@ runs:
|
||||
fi
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
@@ -215,7 +215,7 @@ runs:
|
||||
rm -rf ${WORKDIR}
|
||||
fi
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
- uses: actions/github-script@v7
|
||||
if: always()
|
||||
env:
|
||||
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
|
||||
|
||||
@@ -19,7 +19,7 @@ runs:
|
||||
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
|
||||
if [ "${PR_NUMBER}" != "null" ]; then
|
||||
BRANCH_OR_PR=pr-${PR_NUMBER}
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
|
||||
# Shortcut for special branches
|
||||
BRANCH_OR_PR=${GITHUB_REF_NAME}
|
||||
else
|
||||
|
||||
@@ -80,13 +80,13 @@ runs:
|
||||
|
||||
- name: Checkout
|
||||
if: inputs.needs_postgres_source == 'true'
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
8
.github/workflows/actionlint.yml
vendored
8
.github/workflows/actionlint.yml
vendored
@@ -16,8 +16,14 @@ concurrency:
|
||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
jobs:
|
||||
actionlint:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name}}
|
||||
|
||||
actionlint:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
2
.github/workflows/approved-for-ci-run.yml
vendored
2
.github/workflows/approved-for-ci-run.yml
vendored
@@ -64,7 +64,7 @@ jobs:
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
ref: main
|
||||
token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
|
||||
20
.github/workflows/benchmarking.yml
vendored
20
.github/workflows/benchmarking.yml
vendored
@@ -62,11 +62,11 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -214,14 +214,14 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
# Increase timeout to 8h, default timeout is 6h
|
||||
timeout-minutes: 480
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -362,11 +362,11 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -461,11 +461,11 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -558,11 +558,11 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
|
||||
105
.github/workflows/build-build-tools-image.yml
vendored
Normal file
105
.github/workflows/build-build-tools-image.yml
vendored
Normal file
@@ -0,0 +1,105 @@
|
||||
name: Build build-tools image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
image-tag:
|
||||
description: "build-tools image tag"
|
||||
required: true
|
||||
type: string
|
||||
outputs:
|
||||
image-tag:
|
||||
description: "build-tools tag"
|
||||
value: ${{ inputs.image-tag }}
|
||||
image:
|
||||
description: "build-tools image"
|
||||
value: neondatabase/build-tools:${{ inputs.image-tag }}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
group: build-build-tools-image-${{ inputs.image-tag }}
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
check-image:
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
# This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions)
|
||||
build-image:
|
||||
needs: [ check-image ]
|
||||
if: needs.check-image.outputs.found == 'false'
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
|
||||
|
||||
env:
|
||||
IMAGE_TAG: ${{ inputs.image-tag }}
|
||||
|
||||
steps:
|
||||
- name: Check `input.tag` is correct
|
||||
env:
|
||||
INPUTS_IMAGE_TAG: ${{ inputs.image-tag }}
|
||||
CHECK_IMAGE_TAG : ${{ needs.check-image.outputs.image-tag }}
|
||||
run: |
|
||||
if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then
|
||||
echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'inputs.image-tag' (${CHECK_IMAGE_TAG})"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p /tmp/.docker-custom
|
||||
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/setup-buildx-action@v2
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.build-tools
|
||||
cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
|
||||
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
run: |
|
||||
rm -rf /tmp/.docker-custom
|
||||
|
||||
merge-images:
|
||||
needs: [ build-image ]
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
IMAGE_TAG: ${{ inputs.image-tag }}
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Create multi-arch image
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
|
||||
neondatabase/build-tools:${IMAGE_TAG}-x64 \
|
||||
neondatabase/build-tools:${IMAGE_TAG}-arm64
|
||||
124
.github/workflows/build_and_push_docker_image.yml
vendored
124
.github/workflows/build_and_push_docker_image.yml
vendored
@@ -1,124 +0,0 @@
|
||||
name: Build and Push Docker Image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
dockerfile-path:
|
||||
required: true
|
||||
type: string
|
||||
image-name:
|
||||
required: true
|
||||
type: string
|
||||
outputs:
|
||||
build-tools-tag:
|
||||
description: "tag generated for build tools"
|
||||
value: ${{ jobs.tag.outputs.build-tools-tag }}
|
||||
|
||||
jobs:
|
||||
check-if-build-tools-dockerfile-changed:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
|
||||
steps:
|
||||
- name: Check if Dockerfile.buildtools has changed
|
||||
id: dockerfile
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
|
||||
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
|
||||
exit
|
||||
fi
|
||||
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
|
||||
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
|
||||
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
tag:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ check-if-build-tools-dockerfile-changed ]
|
||||
outputs:
|
||||
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
|
||||
|
||||
steps:
|
||||
- name: Get buildtools tag
|
||||
env:
|
||||
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
|
||||
IMAGE_TAG=$GITHUB_RUN_ID
|
||||
else
|
||||
IMAGE_TAG=pinned
|
||||
fi
|
||||
|
||||
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
|
||||
shell: bash
|
||||
id: buildtools-tag
|
||||
|
||||
kaniko:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: |
|
||||
/kaniko/executor \
|
||||
--reproducible \
|
||||
--snapshotMode=redo \
|
||||
--skip-unused-stages \
|
||||
--dockerfile ${{ inputs.dockerfile-path }} \
|
||||
--cache=true \
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
|
||||
|
||||
kaniko-arm:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: |
|
||||
/kaniko/executor \
|
||||
--reproducible \
|
||||
--snapshotMode=redo \
|
||||
--skip-unused-stages \
|
||||
--dockerfile ${{ inputs.dockerfile-path }} \
|
||||
--cache=true \
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
manifest:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
name: 'manifest'
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs:
|
||||
- tag
|
||||
- kaniko
|
||||
- kaniko-arm
|
||||
- check-if-build-tools-dockerfile-changed
|
||||
|
||||
steps:
|
||||
- name: Create manifest
|
||||
run: |
|
||||
docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
|
||||
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
|
||||
--amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
- name: Push manifest
|
||||
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
|
||||
414
.github/workflows/build_and_test.yml
vendored
414
.github/workflows/build_and_test.yml
vendored
@@ -5,6 +5,7 @@ on:
|
||||
branches:
|
||||
- main
|
||||
- release
|
||||
- release-proxy
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
@@ -27,24 +28,9 @@ env:
|
||||
jobs:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Disallow PRs from forks
|
||||
if: |
|
||||
github.event_name == 'pull_request' &&
|
||||
github.event.pull_request.head.repo.full_name != github.repository
|
||||
|
||||
run: |
|
||||
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
|
||||
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
|
||||
else
|
||||
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
|
||||
fi
|
||||
|
||||
echo >&2 "We don't run CI for PRs from forks"
|
||||
echo >&2 "${MESSAGE}"
|
||||
|
||||
exit 1
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name}}
|
||||
|
||||
cancel-previous-e2e-tests:
|
||||
needs: [ check-permissions ]
|
||||
@@ -69,7 +55,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -82,6 +68,8 @@ jobs:
|
||||
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
|
||||
@@ -89,30 +77,36 @@ jobs:
|
||||
shell: bash
|
||||
id: build-tag
|
||||
|
||||
build-buildtools-image:
|
||||
check-build-tools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/build_and_push_docker_image.yml
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
build-build-tools-image:
|
||||
needs: [ check-build-tools-image ]
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
with:
|
||||
dockerfile-path: Dockerfile.buildtools
|
||||
image-name: build-tools
|
||||
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
@@ -130,15 +124,18 @@ jobs:
|
||||
run: poetry run mypy .
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -146,7 +143,7 @@ jobs:
|
||||
# Disabled for now
|
||||
# - name: Restore cargo deps cache
|
||||
# id: cache_cargo
|
||||
# uses: actions/cache@v3
|
||||
# uses: actions/cache@v4
|
||||
# with:
|
||||
# path: |
|
||||
# !~/.cargo/registry/src
|
||||
@@ -197,10 +194,13 @@ jobs:
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
|
||||
build-neon:
|
||||
needs: [ check-permissions, tag, build-buildtools-image ]
|
||||
needs: [ check-permissions, tag, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# Raise locked memory limit for tokio-epoll-uring.
|
||||
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
|
||||
# io_uring will account the memory of the CQ and SQ as locked.
|
||||
@@ -231,7 +231,7 @@ jobs:
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -303,7 +303,7 @@ jobs:
|
||||
# compressed crates.
|
||||
# - name: Cache cargo deps
|
||||
# id: cache_cargo
|
||||
# uses: actions/cache@v3
|
||||
# uses: actions/cache@v4
|
||||
# with:
|
||||
# path: |
|
||||
# ~/.cargo/registry/
|
||||
@@ -317,21 +317,21 @@ jobs:
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
@@ -438,10 +438,13 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
strategy:
|
||||
@@ -451,7 +454,7 @@ jobs:
|
||||
pg_version: [ v14, v15, v16 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -472,26 +475,34 @@ jobs:
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||
if: |
|
||||
false &&
|
||||
matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
get-benchmarks-durations:
|
||||
outputs:
|
||||
json: ${{ steps.get-benchmark-durations.outputs.json }}
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
@@ -510,10 +521,13 @@ jobs:
|
||||
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
|
||||
|
||||
benchmarks:
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
|
||||
needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
@@ -525,7 +539,7 @@ jobs:
|
||||
build_type: [ release ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Pytest benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -545,16 +559,19 @@ jobs:
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -565,7 +582,7 @@ jobs:
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
- uses: actions/github-script@v7
|
||||
if: ${{ !cancelled() }}
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
@@ -591,10 +608,13 @@ jobs:
|
||||
})
|
||||
|
||||
coverage-report:
|
||||
needs: [ check-permissions, regress-tests, build-buildtools-image ]
|
||||
needs: [ check-permissions, regress-tests, build-build-tools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -605,7 +625,7 @@ jobs:
|
||||
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
@@ -674,7 +694,7 @@ jobs:
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
|
||||
echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
- uses: actions/github-script@v7
|
||||
env:
|
||||
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
@@ -692,166 +712,146 @@ jobs:
|
||||
})
|
||||
|
||||
trigger-e2e-tests:
|
||||
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
|
||||
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
|
||||
needs: [ check-permissions, promote-images, tag ]
|
||||
uses: ./.github/workflows/trigger-e2e-tests.yml
|
||||
secrets: inherit
|
||||
|
||||
neon-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure ECR and Docker Hub login
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
|
||||
echo "::add-mask::${DOCKERHUB_AUTH}"
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
|
||||
cat <<-EOF > /kaniko/.docker/config.json
|
||||
{
|
||||
"auths": {
|
||||
"https://index.docker.io/v1/": {
|
||||
"auth": "${DOCKERHUB_AUTH}"
|
||||
}
|
||||
},
|
||||
"credHelpers": {
|
||||
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Kaniko build neon
|
||||
run:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
- uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
compute-tools-image:
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
|
||||
- name: Configure ECR and Docker Hub login
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
|
||||
echo "::add-mask::${DOCKERHUB_AUTH}"
|
||||
|
||||
cat <<-EOF > /kaniko/.docker/config.json
|
||||
{
|
||||
"auths": {
|
||||
"https://index.docker.io/v1/": {
|
||||
"auth": "${DOCKERHUB_AUTH}"
|
||||
}
|
||||
},
|
||||
"credHelpers": {
|
||||
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
|
||||
- name: Kaniko build compute tools
|
||||
run:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
rm -rf .docker-custom
|
||||
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, build-build-tools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
# Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
|
||||
# Should be prevented by https://github.com/neondatabase/neon/issues/4281
|
||||
options: --add-host=download.osgeo.org:140.211.15.30
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure ECR and Docker Hub login
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
|
||||
echo "::add-mask::${DOCKERHUB_AUTH}"
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
# Disable parallelism for docker buildkit.
|
||||
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
|
||||
config-inline: |
|
||||
[worker.oci]
|
||||
max-parallelism = 1
|
||||
|
||||
cat <<-EOF > /kaniko/.docker/config.json
|
||||
{
|
||||
"auths": {
|
||||
"https://index.docker.io/v1/": {
|
||||
"auth": "${DOCKERHUB_AUTH}"
|
||||
}
|
||||
},
|
||||
"credHelpers": {
|
||||
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Kaniko build compute node with extensions
|
||||
run:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-node
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
--cleanup
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
- name: Build compute-node image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
PG_VERSION=${{ matrix.version }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Build compute-tools image
|
||||
# compute-tools are Postgres independent, so build it only once
|
||||
if: ${{ matrix.version == 'v16' }}
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
target: compute-tools-image
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
vm-compute-node-image:
|
||||
needs: [ check-permissions, tag, compute-node-image ]
|
||||
@@ -895,12 +895,12 @@ jobs:
|
||||
docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
test-images:
|
||||
needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
|
||||
needs: [ check-permissions, tag, neon-image, compute-node-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -929,7 +929,8 @@ jobs:
|
||||
fi
|
||||
|
||||
- name: Verify docker-compose example
|
||||
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
|
||||
timeout-minutes: 20
|
||||
run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
|
||||
|
||||
- name: Print logs and clean up
|
||||
if: always()
|
||||
@@ -962,9 +963,7 @@ jobs:
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
@@ -976,9 +975,7 @@ jobs:
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
- name: Push images to production ECR
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
|
||||
@@ -1002,9 +999,7 @@ jobs:
|
||||
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Push latest tags to Docker Hub
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
@@ -1094,7 +1089,7 @@ jobs:
|
||||
|
||||
deploy:
|
||||
needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
|
||||
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
@@ -1114,7 +1109,7 @@ jobs:
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 0
|
||||
@@ -1129,15 +1124,29 @@ jobs:
|
||||
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f deployStorage=false \
|
||||
-f deployStorageBroker=false \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Create git tag
|
||||
if: github.ref_name == 'release'
|
||||
uses: actions/github-script@v6
|
||||
if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
@@ -1149,9 +1158,10 @@ jobs:
|
||||
sha: context.sha,
|
||||
})
|
||||
|
||||
# TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
|
||||
- name: Create GitHub release
|
||||
if: github.ref_name == 'release'
|
||||
uses: actions/github-script@v6
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
@@ -1200,3 +1210,11 @@ jobs:
|
||||
|
||||
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
|
||||
done
|
||||
|
||||
pin-build-tools-image:
|
||||
needs: [ build-build-tools-image, promote-images, regress-tests ]
|
||||
if: github.ref_name == 'main'
|
||||
uses: ./.github/workflows/pin-build-tools-image.yml
|
||||
with:
|
||||
from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
58
.github/workflows/check-build-tools-image.yml
vendored
Normal file
58
.github/workflows/check-build-tools-image.yml
vendored
Normal file
@@ -0,0 +1,58 @@
|
||||
name: Check build-tools image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
outputs:
|
||||
image-tag:
|
||||
description: "build-tools image tag"
|
||||
value: ${{ jobs.check-image.outputs.tag }}
|
||||
found:
|
||||
description: "Whether the image is found in the registry"
|
||||
value: ${{ jobs.check-image.outputs.found }}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
check-image:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
|
||||
found: ${{ steps.check-image.outputs.found }}
|
||||
|
||||
steps:
|
||||
- name: Get build-tools image tag for the current commit
|
||||
id: get-build-tools-tag
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
LAST_BUILD_TOOLS_SHA=$(
|
||||
gh api \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
--method GET \
|
||||
--field path=Dockerfile.build-tools \
|
||||
--field sha=${COMMIT_SHA} \
|
||||
--field per_page=1 \
|
||||
--jq ".[0].sha" \
|
||||
"/repos/${GITHUB_REPOSITORY}/commits"
|
||||
)
|
||||
echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
- name: Check if such tag found in the registry
|
||||
id: check-image
|
||||
env:
|
||||
IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
|
||||
run: |
|
||||
if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
|
||||
found=true
|
||||
else
|
||||
found=false
|
||||
fi
|
||||
|
||||
echo "found=${found}" | tee -a $GITHUB_OUTPUT
|
||||
36
.github/workflows/check-permissions.yml
vendored
Normal file
36
.github/workflows/check-permissions.yml
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
name: Check Permissions
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
github-event-name:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Disallow CI runs on PRs from forks
|
||||
if: |
|
||||
inputs.github-event-name == 'pull_request' &&
|
||||
github.event.pull_request.head.repo.full_name != github.repository
|
||||
run: |
|
||||
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
|
||||
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
|
||||
else
|
||||
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
|
||||
fi
|
||||
|
||||
# TODO: use actions/github-script to post this message as a PR comment
|
||||
echo >&2 "We don't run CI for PRs from forks"
|
||||
echo >&2 "${MESSAGE}"
|
||||
|
||||
exit 1
|
||||
32
.github/workflows/cleanup-caches-by-a-branch.yml
vendored
Normal file
32
.github/workflows/cleanup-caches-by-a-branch.yml
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
# A workflow from
|
||||
# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries
|
||||
|
||||
name: cleanup caches by a branch
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
- closed
|
||||
|
||||
jobs:
|
||||
cleanup:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Cleanup
|
||||
run: |
|
||||
gh extension install actions/gh-actions-cache
|
||||
|
||||
echo "Fetching list of cache key"
|
||||
cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 )
|
||||
|
||||
## Setting this to not fail the workflow while deleting cache keys.
|
||||
set +e
|
||||
echo "Deleting caches..."
|
||||
for cacheKey in $cacheKeysForPR
|
||||
do
|
||||
gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
|
||||
done
|
||||
echo "Done"
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
REPO: ${{ github.repository }}
|
||||
BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge
|
||||
54
.github/workflows/neon_extra_builds.yml
vendored
54
.github/workflows/neon_extra_builds.yml
vendored
@@ -20,7 +20,25 @@ env:
|
||||
COPT: '-Werror'
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name}}
|
||||
|
||||
check-build-tools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
build-build-tools-image:
|
||||
needs: [ check-build-tools-image ]
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
with:
|
||||
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
check-macos-build:
|
||||
needs: [ check-permissions ]
|
||||
if: |
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
@@ -57,21 +75,21 @@ jobs:
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
@@ -82,7 +100,7 @@ jobs:
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Cache cargo deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
@@ -116,8 +134,8 @@ jobs:
|
||||
run: ./run_clippy.sh
|
||||
|
||||
check-linux-arm-build:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
timeout-minutes: 90
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
|
||||
env:
|
||||
@@ -130,7 +148,10 @@ jobs:
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -172,21 +193,21 @@ jobs:
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
@@ -237,12 +258,15 @@ jobs:
|
||||
cargo nextest run --package remote_storage --test test_real_azure
|
||||
|
||||
check-codestyle-rust-arm:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
timeout-minutes: 90
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -309,13 +333,17 @@ jobs:
|
||||
run: cargo deny check
|
||||
|
||||
gather-rust-build-stats:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
if: |
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
github.ref_name == 'main'
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
env:
|
||||
@@ -356,7 +384,7 @@ jobs:
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Publish build stats report
|
||||
uses: actions/github-script@v6
|
||||
uses: actions/github-script@v7
|
||||
env:
|
||||
REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
|
||||
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
|
||||
6
.github/workflows/pg_clients.yml
vendored
6
.github/workflows/pg_clients.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
@@ -38,7 +38,7 @@ jobs:
|
||||
uses: snok/install-poetry@v1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
|
||||
@@ -82,7 +82,7 @@ jobs:
|
||||
# It will be fixed after switching to gen2 runner
|
||||
- name: Upload python test logs
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
retention-days: 7
|
||||
name: python-test-pg_clients-${{ runner.os }}-stage-logs
|
||||
|
||||
72
.github/workflows/pin-build-tools-image.yml
vendored
Normal file
72
.github/workflows/pin-build-tools-image.yml
vendored
Normal file
@@ -0,0 +1,72 @@
|
||||
name: 'Pin build-tools image'
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
workflow_call:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
group: pin-build-tools-image-${{ inputs.from-tag }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: pinned
|
||||
|
||||
steps:
|
||||
- name: Check if we really need to pin the image
|
||||
id: check-manifests
|
||||
run: |
|
||||
docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
|
||||
docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json
|
||||
|
||||
if diff ${FROM_TAG}.json ${TO_TAG}.json; then
|
||||
skip=true
|
||||
else
|
||||
skip=false
|
||||
fi
|
||||
|
||||
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
|
||||
neondatabase/build-tools:${FROM_TAG}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
|
||||
neondatabase/build-tools:${FROM_TAG}
|
||||
83
.github/workflows/release.yml
vendored
83
.github/workflows/release.yml
vendored
@@ -2,12 +2,31 @@ name: Create Release Branch
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 6 * * 1'
|
||||
# It should be kept in sync with if-condition in jobs
|
||||
- cron: '0 6 * * MON' # Storage release
|
||||
- cron: '0 6 * * THU' # Proxy release
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
create-storage-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Storage release PR'
|
||||
required: false
|
||||
create-proxy-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Proxy release PR'
|
||||
required: false
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
jobs:
|
||||
create_release_branch:
|
||||
runs-on: [ ubuntu-latest ]
|
||||
create-storage-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
@@ -18,27 +37,67 @@ jobs:
|
||||
with:
|
||||
ref: main
|
||||
|
||||
- name: Get current date
|
||||
id: date
|
||||
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
|
||||
- name: Set environment variables
|
||||
run: |
|
||||
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
|
||||
- name: Create release branch
|
||||
run: git checkout -b releases/${{ steps.date.outputs.date }}
|
||||
run: git checkout -b $RELEASE_BRANCH
|
||||
|
||||
- name: Push new branch
|
||||
run: git push origin releases/${{ steps.date.outputs.date }}
|
||||
run: git push origin $RELEASE_BRANCH
|
||||
|
||||
- name: Create pull request into release
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
## Release ${{ steps.date.outputs.date }}
|
||||
## Release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this PR using 'Create a merge commit'!**
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
gh pr create --title "Release ${{ steps.date.outputs.date }}" \
|
||||
gh pr create --title "Release ${RELEASE_DATE}" \
|
||||
--body-file "body.md" \
|
||||
--head "releases/${{ steps.date.outputs.date }}" \
|
||||
--head "${RELEASE_BRANCH}" \
|
||||
--base "release"
|
||||
|
||||
create-proxy-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: main
|
||||
|
||||
- name: Set environment variables
|
||||
run: |
|
||||
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
|
||||
- name: Create release branch
|
||||
run: git checkout -b $RELEASE_BRANCH
|
||||
|
||||
- name: Push new branch
|
||||
run: git push origin $RELEASE_BRANCH
|
||||
|
||||
- name: Create pull request into release
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
## Proxy release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
gh pr create --title "Proxy release ${RELEASE_DATE}}" \
|
||||
--body-file "body.md" \
|
||||
--head "${RELEASE_BRANCH}" \
|
||||
--base "release-proxy"
|
||||
|
||||
7
.github/workflows/trigger-e2e-tests.yml
vendored
7
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -9,7 +9,7 @@ on:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
|
||||
env:
|
||||
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
|
||||
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -51,6 +51,8 @@ jobs:
|
||||
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
|
||||
@@ -115,4 +117,3 @@ jobs:
|
||||
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
70
.github/workflows/update_build_tools_image.yml
vendored
70
.github/workflows/update_build_tools_image.yml
vendored
@@ -1,70 +0,0 @@
|
||||
name: 'Update build tools image tag'
|
||||
|
||||
# This workflow it used to update tag of build tools in ECR.
|
||||
# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
to-tag:
|
||||
description: 'Destination tag'
|
||||
required: true
|
||||
type: string
|
||||
default: 'pinned'
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
|
||||
env:
|
||||
ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
|
||||
steps:
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.21'
|
||||
|
||||
- name: Install crane
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
|
||||
|
||||
- name: Copy images
|
||||
run: |
|
||||
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
|
||||
crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -9,6 +9,7 @@ test_output/
|
||||
neon.iml
|
||||
/.neon
|
||||
/integration_tests/.neon
|
||||
compaction-suite-results.*
|
||||
|
||||
# Coverage
|
||||
*.profraw
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||
/control_plane/ @neondatabase/compute @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
|
||||
/control_plane/attachment_service @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
/pageserver/ @neondatabase/storage
|
||||
/pgxn/ @neondatabase/compute
|
||||
/proxy/ @neondatabase/proxy
|
||||
|
||||
@@ -74,16 +74,11 @@ We're using the following approach to make it work:
|
||||
|
||||
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
|
||||
|
||||
## How do I add the "pinned" tag to an buildtools image?
|
||||
We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
|
||||
## How do I make build-tools image "pinned"
|
||||
|
||||
You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
|
||||
or using GitHub CLI:
|
||||
It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow.
|
||||
|
||||
```bash
|
||||
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
|
||||
-f from-tag=6254913013 \
|
||||
-f to-tag=pinned \
|
||||
|
||||
# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
|
||||
```
|
||||
gh workflow -R neondatabase/neon run pin-build-tools-image.yml \
|
||||
-f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e
|
||||
```
|
||||
|
||||
75
Cargo.lock
generated
75
Cargo.lock
generated
@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.5"
|
||||
version = "0.8.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
|
||||
checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"const-random",
|
||||
@@ -284,8 +284,10 @@ dependencies = [
|
||||
"diesel_migrations",
|
||||
"futures",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres_connection",
|
||||
@@ -1387,9 +1389,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crc32c"
|
||||
version = "0.6.3"
|
||||
version = "0.6.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
|
||||
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
|
||||
dependencies = [
|
||||
"rustc_version",
|
||||
]
|
||||
@@ -1813,6 +1815,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb"
|
||||
dependencies = [
|
||||
"enumset_derive",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2757,6 +2760,17 @@ version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "leaky-bucket"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
|
||||
dependencies = [
|
||||
"parking_lot 0.12.1",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.150"
|
||||
@@ -3448,6 +3462,7 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
@@ -3475,6 +3490,7 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"leaky-bucket",
|
||||
"md5",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
@@ -3482,6 +3498,7 @@ dependencies = [
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_compaction",
|
||||
"pin-project-lite",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
@@ -3536,7 +3553,9 @@ dependencies = [
|
||||
"const_format",
|
||||
"enum-map",
|
||||
"hex",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
@@ -3570,6 +3589,53 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_compaction"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"const_format",
|
||||
"consumption_metrics",
|
||||
"criterion",
|
||||
"crossbeam-utils",
|
||||
"either",
|
||||
"fail",
|
||||
"flate2",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pin-project-lite",
|
||||
"rand 0.8.5",
|
||||
"smallvec",
|
||||
"svg_fmt",
|
||||
"sync_wrapper",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"utils",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking"
|
||||
version = "2.1.1"
|
||||
@@ -6347,6 +6413,7 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"hyper",
|
||||
"jsonwebtoken",
|
||||
"leaky-bucket",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
|
||||
@@ -5,6 +5,7 @@ members = [
|
||||
"control_plane",
|
||||
"control_plane/attachment_service",
|
||||
"pageserver",
|
||||
"pageserver/compaction",
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/pagebench",
|
||||
@@ -97,6 +98,7 @@ ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
jsonwebtoken = "9"
|
||||
lasso = "0.7"
|
||||
leaky-bucket = "1.0.1"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
memoffset = "0.8"
|
||||
@@ -198,6 +200,7 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
||||
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
||||
|
||||
@@ -72,6 +72,737 @@ RUN cd postgres && \
|
||||
fi; \
|
||||
done
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "postgis-build"
|
||||
# Build PostGIS from the upstream PostGIS mirror.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS postgis-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
|
||||
libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
|
||||
libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
|
||||
protobuf-c-compiler xsltproc
|
||||
|
||||
# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
|
||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
|
||||
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
|
||||
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make clean && cp -R /sfcgal/* /
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
|
||||
echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
|
||||
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
./autogen.sh && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
cd extensions/postgis && \
|
||||
make clean && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_sfcgal.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
|
||||
mkdir -p /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
|
||||
|
||||
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
|
||||
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "plv8-build"
|
||||
# Build plv8
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y ninja-build python3-dev libncurses5 binutils clang
|
||||
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
|
||||
echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
|
||||
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
|
||||
# generate and copy upgrade scripts
|
||||
mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
|
||||
cp upgrade/* /usr/local/pgsql/share/extension/ && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
# don't break computes with installed old version of plv8
|
||||
cd /usr/local/pgsql/lib/ && \
|
||||
ln -s plv8-3.1.10.so plv8-3.1.5.so && \
|
||||
ln -s plv8-3.1.10.so plv8-3.1.8.so && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "h3-pg-build"
|
||||
# Build h3_pg
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS h3-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN case "$(uname -m)" in \
|
||||
"x86_64") \
|
||||
export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
|
||||
;; \
|
||||
"aarch64") \
|
||||
export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
|
||||
;; \
|
||||
*) \
|
||||
echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
|
||||
-q -O /tmp/cmake-install.sh \
|
||||
&& echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
|
||||
&& chmod u+x /tmp/cmake-install.sh \
|
||||
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
|
||||
&& rm /tmp/cmake-install.sh
|
||||
|
||||
RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
|
||||
echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
|
||||
mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && cd build && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/h3 make install && \
|
||||
cp -R /h3/usr / && \
|
||||
rm -rf build
|
||||
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
|
||||
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
|
||||
mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "unit-pg-build"
|
||||
# compile unit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS unit-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
|
||||
echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
|
||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
||||
# We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path.
|
||||
# This one-liner removes pgsql/ part of the path.
|
||||
# NOTE: Other extensions that rely on MODULEDIR variable after building phase will need the same fix.
|
||||
find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "vector-pg-build"
|
||||
# compile pgvector extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgjwt-pg-build"
|
||||
# compile pgjwt extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgjwt-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
|
||||
RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
|
||||
echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "hypopg-pg-build"
|
||||
# compile hypopg extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS hypopg-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
|
||||
echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
|
||||
mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-hashids-pg-build"
|
||||
# compile pg_hashids extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-hashids-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
|
||||
echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rum-pg-build"
|
||||
# compile rum extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rum-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
|
||||
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
|
||||
mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgtap-pg-build"
|
||||
# compile pgTAP extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgtap-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
|
||||
echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
|
||||
mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "ip4r-pg-build"
|
||||
# compile ip4r extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS ip4r-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
|
||||
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
|
||||
mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "prefix-pg-build"
|
||||
# compile Prefix extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS prefix-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
|
||||
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
|
||||
mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "hll-pg-build"
|
||||
# compile hll extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS hll-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
|
||||
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
|
||||
mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "plpgsql-check-pg-build"
|
||||
# compile plpgsql_check extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS plpgsql-check-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
|
||||
echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
|
||||
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "timescaledb-pg-build"
|
||||
# compile timescaledb extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS timescaledb-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export TIMESCALEDB_VERSION=2.10.1 \
|
||||
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
|
||||
;; \
|
||||
*) \
|
||||
export TIMESCALEDB_VERSION=2.13.0 \
|
||||
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
|
||||
;; \
|
||||
esac && \
|
||||
apt-get update && \
|
||||
apt-get install -y cmake && \
|
||||
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
|
||||
echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
|
||||
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
|
||||
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
|
||||
cd build && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-hint-plan-pg-build"
|
||||
# compile pg_hint_plan extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-hint-plan-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14") \
|
||||
export PG_HINT_PLAN_VERSION=14_1_4_1 \
|
||||
export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
|
||||
;; \
|
||||
"v15") \
|
||||
export PG_HINT_PLAN_VERSION=15_1_5_0 \
|
||||
export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \
|
||||
;; \
|
||||
"v16") \
|
||||
export PG_HINT_PLAN_VERSION=16_1_6_0 \
|
||||
export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
|
||||
;; \
|
||||
*) \
|
||||
echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \
|
||||
echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "kq-imcx-pg-build"
|
||||
# compile kq_imcx extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS kq-imcx-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
|
||||
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
|
||||
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
|
||||
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-cron-pg-build"
|
||||
# compile pg_cron extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-cron-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
|
||||
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rdkit-pg-build"
|
||||
# compile rdkit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rdkit-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
cmake \
|
||||
libboost-iostreams1.74-dev \
|
||||
libboost-regex1.74-dev \
|
||||
libboost-serialization1.74-dev \
|
||||
libboost-system1.74-dev \
|
||||
libeigen3-dev
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
|
||||
echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
|
||||
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
|
||||
cmake \
|
||||
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
|
||||
-D RDK_BUILD_INCHI_SUPPORT=ON \
|
||||
-D RDK_BUILD_AVALON_SUPPORT=ON \
|
||||
-D RDK_BUILD_PYTHON_WRAPPERS=OFF \
|
||||
-D RDK_BUILD_DESCRIPTORS3D=OFF \
|
||||
-D RDK_BUILD_FREESASA_SUPPORT=OFF \
|
||||
-D RDK_BUILD_COORDGEN_SUPPORT=ON \
|
||||
-D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
|
||||
-D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
|
||||
-D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
|
||||
-D RDK_USE_URF=OFF \
|
||||
-D RDK_BUILD_PGSQL=ON \
|
||||
-D RDK_PGSQL_STATIC=ON \
|
||||
-D PostgreSQL_CONFIG=pg_config \
|
||||
-D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
|
||||
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
|
||||
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
|
||||
-D RDK_INSTALL_INTREE=OFF \
|
||||
-D RDK_INSTALL_COMIC_FONTS=OFF \
|
||||
-D RDK_BUILD_FREETYPE_SUPPORT=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-uuidv7-pg-build"
|
||||
# compile pg_uuidv7 extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-uuidv7-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
|
||||
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-roaringbitmap-pg-build"
|
||||
# compile pg_roaringbitmap extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-roaringbitmap-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
|
||||
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-semver-pg-build"
|
||||
# compile pg_semver extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-semver-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
|
||||
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-embedding-pg-build"
|
||||
# compile pg_embedding extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export PG_EMBEDDING_VERSION=0.3.5 \
|
||||
export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
|
||||
;; \
|
||||
*) \
|
||||
echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-anon-pg-build"
|
||||
# compile anon extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-anon-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions"
|
||||
# This layer is used to build `pgrx` deps
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rust-extensions-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl libclang-dev cmake && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --locked --version 0.10.2 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-jsonschema-pg-build"
|
||||
# Compile "pg_jsonschema" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-jsonschema-pg-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
|
||||
echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-graphql-pg-build"
|
||||
# Compile "pg_graphql" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-graphql-pg-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
|
||||
echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
# it's needed to enable extension because it uses untrusted C language
|
||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-tiktoken-build"
|
||||
# Compile "pg_tiktoken" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-tiktoken-pg-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
|
||||
RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
|
||||
echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-pgx-ulid-build"
|
||||
# Compile "pgx_ulid" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-pgx-ulid-build
|
||||
ARG PG_VERSION
|
||||
|
||||
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
|
||||
echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
|
||||
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
|
||||
echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
|
||||
wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
|
||||
patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
|
||||
echo "********************************************************************************************************" && \
|
||||
sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "wal2json-build"
|
||||
# Compile "wal2json" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM build-deps AS wal2json-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
|
||||
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
|
||||
mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_ivm"
|
||||
# compile pg_ivm extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-ivm-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
|
||||
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_partman"
|
||||
# compile pg_partman extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-partman-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
|
||||
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
@@ -81,6 +812,40 @@ RUN cd postgres && \
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# Public extensions
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=postgis-build /sfcgal/* /
|
||||
COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=h3-pg-build /h3/usr /
|
||||
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=ip4r-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -126,7 +891,17 @@ ENV BUILD_TAG=$BUILD_TAG
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||
RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Final compute-tools image
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM debian:bullseye-slim AS compute-tools-image
|
||||
|
||||
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -166,7 +941,7 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache
|
||||
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local/pgsql-$PG_VERSION
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
# Install:
|
||||
|
||||
@@ -1,179 +0,0 @@
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "build-deps"
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-build"
|
||||
# Build Postgres from the neon postgres repository.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-build
|
||||
COPY "vendor/postgres-v14" /postgres-v14
|
||||
COPY "vendor/postgres-v15" /postgres-v15
|
||||
COPY "vendor/postgres-v16" /postgres-v16
|
||||
RUN for pg_version in v14 v15 v16; do \
|
||||
install_dir="/postgres-$pg_version"; \
|
||||
cd "$install_dir"; \
|
||||
prefix="/usr/local/pgsql-${pg_version}"; \
|
||||
export CONFIGURE_CMD="./configure --prefix ${prefix} CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
|
||||
--with-icu --with-libxml --with-libxslt --with-lz4" && \
|
||||
if [ "${pg_version}" != "v14" ]; then \
|
||||
# zstd is available only from PG15
|
||||
export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
|
||||
fi && \
|
||||
eval $CONFIGURE_CMD && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
extension_dir="${prefix}/share/extension" && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> $extension_dir/autoinc.control && \
|
||||
echo 'trusted = true' >> $extension_dir/bloom.control && \
|
||||
echo 'trusted = true' >> $extension_dir/earthdistance.control && \
|
||||
echo 'trusted = true' >> $extension_dir/insert_username.control && \
|
||||
echo 'trusted = true' >> $extension_dir/intagg.control && \
|
||||
echo 'trusted = true' >> $extension_dir/moddatetime.control && \
|
||||
echo 'trusted = true' >> $extension_dir/pg_stat_statements.control && \
|
||||
echo 'trusted = true' >> $extension_dir/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> $extension_dir/pgstattuple.control && \
|
||||
echo 'trusted = true' >> $extension_dir/refint.control && \
|
||||
echo 'trusted = true' >> $extension_dir/xml2.control && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
|
||||
# In vanilla postgres this function is limited to Postgres role superuser.
|
||||
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
# the first loop is for pg_stat_statement extension version <= 1.6
|
||||
for file in $prefix/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# where pg_stat_statement_reset() got 3 additional arguments
|
||||
for file in $prefix/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# Go back to root dir from `/postgres-v<version>` dir
|
||||
cd ..; \
|
||||
done
|
||||
|
||||
# #########################################################################################
|
||||
# #
|
||||
# # Compile and run the Neon-specific `compute_ctl` binary
|
||||
# #
|
||||
# #########################################################################################
|
||||
# FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
|
||||
# ARG BUILD_TAG
|
||||
# ENV BUILD_TAG=$BUILD_TAG
|
||||
#
|
||||
# USER nonroot
|
||||
# # Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
# COPY --chown=nonroot . .
|
||||
# RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||
#
|
||||
# #########################################################################################
|
||||
# #
|
||||
# # Clean up postgres folder before inclusion
|
||||
# #
|
||||
# #########################################################################################
|
||||
# FROM pg-build AS postgres-cleanup-layer
|
||||
# # COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
|
||||
#
|
||||
# RUN for pg_version in v14 v15 v16; do \
|
||||
# prefix="/usr/local/pgsql-${pg_version}"; \
|
||||
# # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
|
||||
# cd "${prefix}/bin" && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp; \
|
||||
# cd ..; \
|
||||
# # Remove headers that we won't need anymore - we've completed installation of all extensions
|
||||
# rm -r "${prefix}/include"; \
|
||||
# # Remove static postgresql libraries - all compilation is finished, so we
|
||||
# # can now remove these files - they must be included in other binaries by now
|
||||
# # if they were to be used by other libraries.
|
||||
# rm ${prefix}/lib/lib*.a; \
|
||||
# done
|
||||
#
|
||||
# #########################################################################################
|
||||
# #
|
||||
# # Final layer
|
||||
# # Put it all together into the final image
|
||||
# #
|
||||
# #########################################################################################
|
||||
# FROM debian:bullseye-slim
|
||||
# # Add user postgres
|
||||
# RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
# echo "postgres:test_console_pass" | chpasswd && \
|
||||
# mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
# mkdir /var/db/postgres/pgbouncer && \
|
||||
# chown -R postgres:postgres /var/db/postgres && \
|
||||
# chmod 0750 /var/db/postgres/compute && \
|
||||
# chmod 0750 /var/db/postgres/pgbouncer && \
|
||||
# echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
||||
# # create folder for file cache
|
||||
# mkdir -p -m 777 /neon/cache
|
||||
#
|
||||
# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql-v14 /usr/local/pgsql-v14
|
||||
# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql-v15 /usr/local/pgsql-v15
|
||||
# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql-v16 /usr/local/pgsql-v16
|
||||
# COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
#
|
||||
# # Install:
|
||||
# # libreadline8 for psql
|
||||
# # libicu67, locales for collations (including ICU and plpgsql_check)
|
||||
# # liblz4-1 for lz4
|
||||
# # libossp-uuid16 for extension ossp-uuid
|
||||
# # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# # libxml2, libxslt1.1 for xml2
|
||||
# # libzstd1 for zstd
|
||||
# # libboost* for rdkit
|
||||
# # ca-certificates for communicating with s3 by compute_ctl
|
||||
# RUN apt update && \
|
||||
# apt install --no-install-recommends -y \
|
||||
# gdb \
|
||||
# libicu67 \
|
||||
# liblz4-1 \
|
||||
# libreadline8 \
|
||||
# libboost-iostreams1.74.0 \
|
||||
# libboost-regex1.74.0 \
|
||||
# libboost-serialization1.74.0 \
|
||||
# libboost-system1.74.0 \
|
||||
# libossp-uuid16 \
|
||||
# libgeos-c1v5 \
|
||||
# libgdal28 \
|
||||
# libproj19 \
|
||||
# libprotobuf-c1 \
|
||||
# libsfcgal1 \
|
||||
# libxml2 \
|
||||
# libxslt1.1 \
|
||||
# libzstd1 \
|
||||
# libcurl4-openssl-dev \
|
||||
# locales \
|
||||
# procps \
|
||||
# ca-certificates && \
|
||||
# rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
# localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
#
|
||||
# ENV LANG en_US.utf8
|
||||
# USER postgres
|
||||
# ENTRYPOINT ["/usr/local/bin/compute_ctl"]
|
||||
@@ -1,65 +0,0 @@
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Final layer
|
||||
# Put it all together into the final image
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim
|
||||
# Add user postgres
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
echo "postgres:test_console_pass" | chpasswd && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
mkdir /var/db/postgres/pgbouncer && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute && \
|
||||
chmod 0750 /var/db/postgres/pgbouncer && \
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
|
||||
# create folder for file cache
|
||||
mkdir -p -m 777 /neon/cache
|
||||
|
||||
COPY --from=base:v14 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v14
|
||||
COPY --from=base:v15 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v15
|
||||
COPY --from=base:v16 --chown=postgres /usr/local/pgsql /usr/local/pgsql-v16
|
||||
COPY --from=tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
# Install:
|
||||
# libreadline8 for psql
|
||||
# libicu67, locales for collations (including ICU and plpgsql_check)
|
||||
# liblz4-1 for lz4
|
||||
# libossp-uuid16 for extension ossp-uuid
|
||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
# libzstd1 for zstd
|
||||
# libboost* for rdkit
|
||||
# ca-certificates for communicating with s3 by compute_ctl
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
gdb \
|
||||
libicu67 \
|
||||
liblz4-1 \
|
||||
libreadline8 \
|
||||
libboost-iostreams1.74.0 \
|
||||
libboost-regex1.74.0 \
|
||||
libboost-serialization1.74.0 \
|
||||
libboost-system1.74.0 \
|
||||
libossp-uuid16 \
|
||||
libgeos-c1v5 \
|
||||
libgdal28 \
|
||||
libproj19 \
|
||||
libprotobuf-c1 \
|
||||
libsfcgal1 \
|
||||
libxml2 \
|
||||
libxslt1.1 \
|
||||
libzstd1 \
|
||||
libcurl4-openssl-dev \
|
||||
locales \
|
||||
procps \
|
||||
ca-certificates && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
ENV LANG en_US.utf8
|
||||
USER postgres
|
||||
ENTRYPOINT ["/usr/local/bin/compute_ctl"]
|
||||
@@ -1,159 +0,0 @@
|
||||
ARG PG_VERSION
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "build-deps"
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-build"
|
||||
# Build Postgres from the neon postgres repository.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-build
|
||||
ARG PG_VERSION
|
||||
COPY vendor/postgres-${PG_VERSION} postgres
|
||||
RUN cd postgres && \
|
||||
export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
|
||||
--with-icu --with-libxml --with-libxslt --with-lz4" && \
|
||||
if [ "${PG_VERSION}" != "v14" ]; then \
|
||||
# zstd is available only from PG15
|
||||
export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
|
||||
fi && \
|
||||
eval $CONFIGURE_CMD && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
|
||||
# In vanilla postgres this function is limited to Postgres role superuser.
|
||||
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
# the first loop is for pg_stat_statement extension version <= 1.6
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# where pg_stat_statement_reset() got 3 additional arguments
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
# compile neon extensions
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
ARG PG_VERSION
|
||||
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_utils \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_test_utils \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_rmgr \
|
||||
-s install && \
|
||||
case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
;; \
|
||||
"v16") \
|
||||
echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
|
||||
;; \
|
||||
*) \
|
||||
echo "unexpected PostgreSQL version" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/hnsw \
|
||||
-s install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Compile and run the Neon-specific `compute_ctl` binary
|
||||
#
|
||||
#########################################################################################
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Clean up postgres folder before inclusion
|
||||
#
|
||||
#########################################################################################
|
||||
FROM neon-pg-ext-build AS postgres-cleanup-layer
|
||||
COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
|
||||
|
||||
# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
|
||||
RUN cd /usr/local/pgsql/bin && rm ecpg
|
||||
|
||||
# Remove headers that we won't need anymore - we've completed installation of all extensions
|
||||
RUN rm -r /usr/local/pgsql/include
|
||||
|
||||
# Remove static postgresql libraries - all compilation is finished, so we
|
||||
# can now remove these files - they must be included in other binaries by now
|
||||
# if they were to be used by other libraries.
|
||||
RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Final layer
|
||||
# Put it all together into the final image
|
||||
#
|
||||
#########################################################################################
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local/pgsql
|
||||
@@ -1,18 +0,0 @@
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
#########################################################################################
|
||||
#
|
||||
# Compile and run the Neon-specific `compute_ctl` binary
|
||||
#
|
||||
#########################################################################################
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
# First transient image to build compute_tools binaries
|
||||
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
|
||||
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
|
||||
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
|
||||
ARG RUSTC_WRAPPER=cachepot
|
||||
ENV AWS_REGION=eu-central-1
|
||||
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
|
||||
ARG CACHEPOT_BUCKET=neon-github-dev
|
||||
#ARG AWS_ACCESS_KEY_ID
|
||||
#ARG AWS_SECRET_ACCESS_KEY
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN set -e \
|
||||
&& mold -run cargo build -p compute_tools --locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Final image that only has one binary
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl
|
||||
14
README.md
14
README.md
@@ -5,7 +5,7 @@
|
||||
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
|
||||
## Quick start
|
||||
Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
|
||||
Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
|
||||
|
||||
Alternatively, compile and run the project [locally](#running-local-installation).
|
||||
|
||||
@@ -230,6 +230,12 @@ postgres=# select * from t;
|
||||
> cargo neon stop
|
||||
```
|
||||
|
||||
More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
|
||||
|
||||
#### Handling build failures
|
||||
|
||||
If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
|
||||
|
||||
## Running tests
|
||||
|
||||
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
|
||||
@@ -259,6 +265,12 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th
|
||||
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
|
||||
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
|
||||
|
||||
## Cleanup
|
||||
|
||||
For cleaning up the source tree from build artifacts, run `make clean` in the source directory.
|
||||
|
||||
For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned!
|
||||
|
||||
## Documentation
|
||||
|
||||
[docs](/docs) Contains a top-level overview of all available markdown documentation.
|
||||
|
||||
@@ -45,7 +45,6 @@ use std::{thread, time::Duration};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info};
|
||||
@@ -53,7 +52,9 @@ use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
|
||||
use compute_tools::compute::{
|
||||
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
|
||||
};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::get_pg_version;
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
@@ -394,6 +395,15 @@ fn main() -> Result<()> {
|
||||
info!("synced safekeepers at lsn {lsn}");
|
||||
}
|
||||
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::TerminationPending {
|
||||
state.status = ComputeStatus::Terminated;
|
||||
compute.state_changed.notify_all();
|
||||
// we were asked to terminate gracefully, don't exit to avoid restart
|
||||
delay_exit = true
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if let Err(err) = compute.check_for_core_dumps() {
|
||||
error!("error while checking for core dumps: {err:?}");
|
||||
}
|
||||
@@ -523,16 +533,7 @@ fn cli() -> clap::Command {
|
||||
/// wait for termination which would be easy then.
|
||||
fn handle_exit_signal(sig: i32) {
|
||||
info!("received {sig} termination signal");
|
||||
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
|
||||
if ss_pid != 0 {
|
||||
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
|
||||
kill(ss_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
kill(pg_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
forward_termination_signal();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -18,8 +18,6 @@ use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use postgres::{Client, NoTls};
|
||||
use tokio;
|
||||
use tokio_postgres;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -28,6 +26,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
|
||||
use crate::checker::create_availability_check_data;
|
||||
@@ -1322,3 +1322,17 @@ LIMIT 100",
|
||||
Ok(remote_ext_metrics)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn forward_termination_signal() {
|
||||
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
|
||||
if ss_pid != 0 {
|
||||
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
|
||||
kill(ss_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
|
||||
kill(pg_pid, Signal::SIGQUIT).ok();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,6 +51,9 @@ pub fn write_postgres_conf(
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
}
|
||||
if let Some(stripe_size) = spec.shard_stripe_size {
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
if !spec.safekeeper_connstrings.is_empty() {
|
||||
writeln!(
|
||||
file,
|
||||
@@ -79,6 +82,12 @@ pub fn write_postgres_conf(
|
||||
ComputeMode::Replica => {
|
||||
// hot_standby is 'on' by default, but let's be explicit
|
||||
writeln!(file, "hot_standby=on")?;
|
||||
|
||||
// Inform the replica about the primary state
|
||||
// Default is 'false'
|
||||
if let Some(primary_is_running) = spec.primary_is_running {
|
||||
writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
|
||||
}
|
||||
}
|
||||
*/
|
||||
use anyhow::{self, Result};
|
||||
use anyhow::Result;
|
||||
use anyhow::{bail, Context};
|
||||
use bytes::Bytes;
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::compute::forward_termination_signal;
|
||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
||||
@@ -12,8 +13,6 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
|
||||
use anyhow::Result;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use num_cpus;
|
||||
use serde_json;
|
||||
use tokio::task;
|
||||
use tracing::{error, info, warn};
|
||||
use tracing_utils::http::OtelName;
|
||||
@@ -123,6 +122,17 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/terminate") => {
|
||||
info!("serving /terminate POST request");
|
||||
match handle_terminate_request(compute).await {
|
||||
Ok(()) => Response::new(Body::empty()),
|
||||
Err((msg, code)) => {
|
||||
error!("error handling /terminate request: {msg}");
|
||||
render_json_error(&msg, code)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from remote extension storage on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
@@ -297,6 +307,49 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::Terminated {
|
||||
return Ok(());
|
||||
}
|
||||
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for termination request: {:?}",
|
||||
state.status.clone()
|
||||
);
|
||||
return Err((msg, StatusCode::PRECONDITION_FAILED));
|
||||
}
|
||||
state.status = ComputeStatus::TerminationPending;
|
||||
compute.state_changed.notify_all();
|
||||
drop(state);
|
||||
}
|
||||
forward_termination_signal();
|
||||
info!("sent signal and notified waiters");
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Terminated.
|
||||
// This is needed to do not block the main pool of workers and
|
||||
// be able to serve other requests while some particular request
|
||||
// is waiting for compute to finish configuration.
|
||||
let c = compute.clone();
|
||||
task::spawn_blocking(move || {
|
||||
let mut state = c.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::Terminated {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
info!(
|
||||
"waiting for compute to become Terminated, current status: {:?}",
|
||||
state.status
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
.unwrap()?;
|
||||
info!("terminated Postgres");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(port: u16, state: Arc<ComputeNode>) {
|
||||
|
||||
@@ -168,6 +168,29 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
/terminate:
|
||||
post:
|
||||
tags:
|
||||
- Terminate
|
||||
summary: Terminate Postgres and wait for it to exit
|
||||
description: ""
|
||||
operationId: terminate
|
||||
responses:
|
||||
200:
|
||||
description: Result
|
||||
412:
|
||||
description: "wrong state"
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
500:
|
||||
description: "Unexpected error"
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
|
||||
@@ -655,6 +655,9 @@ pub fn handle_grants(
|
||||
// remove this code if possible. The worst thing that could happen is that
|
||||
// user won't be able to use public schema in NEW databases created in the
|
||||
// very OLD project.
|
||||
//
|
||||
// Also, alter default permissions so that relations created by extensions can be
|
||||
// used by neon_superuser without permission issues.
|
||||
let grant_query = "DO $$\n\
|
||||
BEGIN\n\
|
||||
IF EXISTS(\n\
|
||||
@@ -673,6 +676,15 @@ pub fn handle_grants(
|
||||
GRANT CREATE ON SCHEMA public TO web_access;\n\
|
||||
END IF;\n\
|
||||
END IF;\n\
|
||||
IF EXISTS(\n\
|
||||
SELECT nspname\n\
|
||||
FROM pg_catalog.pg_namespace\n\
|
||||
WHERE nspname = 'public'\n\
|
||||
)\n\
|
||||
THEN\n\
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
|
||||
END IF;\n\
|
||||
END\n\
|
||||
$$;"
|
||||
.to_string();
|
||||
@@ -777,6 +789,12 @@ BEGIN
|
||||
END
|
||||
$$;"#,
|
||||
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
|
||||
// Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
@@ -803,8 +821,13 @@ $$;"#,
|
||||
client.simple_query(query)?;
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
info!("Running migration:\n{}\n", migrations[current_migration]);
|
||||
client.simple_query(migrations[current_migration])?;
|
||||
let migration = &migrations[current_migration];
|
||||
if migration.is_empty() {
|
||||
info!("Skip migration id={}", current_migration);
|
||||
} else {
|
||||
info!("Running migration:\n{}\n", migration);
|
||||
client.simple_query(migration)?;
|
||||
}
|
||||
current_migration += 1;
|
||||
}
|
||||
let setval = format!(
|
||||
|
||||
26
control_plane/README.md
Normal file
26
control_plane/README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# Control Plane and Neon Local
|
||||
|
||||
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
|
||||
|
||||
## Example: Start with Postgres 16
|
||||
|
||||
To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.
|
||||
|
||||
```shell
|
||||
cargo neon init --pg-version 16
|
||||
cargo neon start
|
||||
cargo neon tenant create --set-default --pg-version 16
|
||||
cargo neon endpoint create main --pg-version 16
|
||||
cargo neon endpoint start main
|
||||
```
|
||||
|
||||
## Example: Create Test User and Database
|
||||
|
||||
By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint.
|
||||
|
||||
```shell
|
||||
cargo neon endpoint create main --pg-version 16 --update-catalog true
|
||||
cargo neon endpoint start main --create-test-user true
|
||||
```
|
||||
|
||||
The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command.
|
||||
@@ -18,6 +18,8 @@ clap.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hyper.workspace = true
|
||||
humantime.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
|
||||
ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;
|
||||
@@ -0,0 +1,4 @@
|
||||
|
||||
|
||||
ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
|
||||
ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;
|
||||
9
control_plane/attachment_service/src/auth.rs
Normal file
9
control_plane/attachment_service/src/auth.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
use utils::auth::{AuthError, Claims, Scope};
|
||||
|
||||
pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
|
||||
if claims.scope != required_scope {
|
||||
return Err(AuthError("Scope mismatch. Permission denied".into()));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,18 +1,19 @@
|
||||
use crate::reconciler::ReconcileError;
|
||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use crate::PlacementPolicy;
|
||||
use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use pageserver_api::models::{
|
||||
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||
TimelineCreateRequest,
|
||||
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||
TenantTimeTravelRequest, TimelineCreateRequest,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use utils::auth::SwappableJwtAuth;
|
||||
use utils::http::endpoint::{auth_middleware, request_span};
|
||||
use utils::http::request::parse_request_param;
|
||||
use utils::auth::{Scope, SwappableJwtAuth};
|
||||
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
|
||||
use utils::http::request::{must_get_query_param, parse_request_param};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::{
|
||||
@@ -25,12 +26,12 @@ use utils::{
|
||||
id::NodeId,
|
||||
};
|
||||
|
||||
use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
use control_plane::attachment_service::{
|
||||
AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
|
||||
TenantShardMigrateRequest,
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||
};
|
||||
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
@@ -64,21 +65,18 @@ fn get_state(request: &Request<Body>) -> &HttpState {
|
||||
|
||||
/// Pageserver calls into this on startup, to learn which tenants it should attach
|
||||
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::GenerationsApi)?;
|
||||
|
||||
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
.re_attach(reattach_req)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?,
|
||||
)
|
||||
json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
|
||||
}
|
||||
|
||||
/// Pageserver calls into this before doing deletions, to confirm that it still
|
||||
/// holds the latest generation for the tenants with deletions enqueued
|
||||
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::GenerationsApi)?;
|
||||
|
||||
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.validate(validate_req))
|
||||
@@ -88,6 +86,8 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
|
||||
/// (in the real control plane this is unnecessary, because the same program is managing
|
||||
/// generation numbers and doing attachments).
|
||||
async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
|
||||
@@ -102,6 +102,8 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
}
|
||||
|
||||
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req);
|
||||
@@ -113,8 +115,18 @@ async fn handle_tenant_create(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
|
||||
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
|
||||
|
||||
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
|
||||
// have no expectation of HA).
|
||||
let placement_policy = PlacementPolicy::Single;
|
||||
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
service.tenant_create(create_req, placement_policy).await?,
|
||||
)
|
||||
}
|
||||
|
||||
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
|
||||
@@ -168,6 +180,8 @@ async fn handle_tenant_location_config(
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
@@ -177,11 +191,76 @@ async fn handle_tenant_location_config(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_config_set(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
|
||||
|
||||
json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_config_get(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_time_travel_remote_storage(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
|
||||
|
||||
let timestamp_raw = must_get_query_param(&req, "travel_to")?;
|
||||
let _timestamp = humantime::parse_rfc3339(×tamp_raw).map_err(|_e| {
|
||||
ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Invalid time for travel_to: {timestamp_raw:?}"
|
||||
))
|
||||
})?;
|
||||
|
||||
let done_if_after_raw = must_get_query_param(&req, "done_if_after")?;
|
||||
let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| {
|
||||
ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Invalid time for done_if_after: {done_if_after_raw:?}"
|
||||
))
|
||||
})?;
|
||||
|
||||
service
|
||||
.tenant_time_travel_remote_storage(
|
||||
&time_travel_req,
|
||||
tenant_id,
|
||||
timestamp_raw,
|
||||
done_if_after_raw,
|
||||
)
|
||||
.await?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_secondary_download(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
service.tenant_secondary_download(tenant_id).await?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_delete(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
service.tenant_delete(tenant_id).await
|
||||
@@ -194,9 +273,11 @@ async fn handle_tenant_timeline_create(
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
StatusCode::CREATED,
|
||||
service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?,
|
||||
@@ -208,6 +289,8 @@ async fn handle_tenant_timeline_delete(
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
@@ -221,6 +304,7 @@ async fn handle_tenant_timeline_passthrough(
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let Some(path) = req.uri().path_and_query() else {
|
||||
// This should never happen, our request router only calls us if there is a path
|
||||
@@ -264,11 +348,15 @@ async fn handle_tenant_locate(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
state.service.node_register(register_req).await?;
|
||||
@@ -276,17 +364,23 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
}
|
||||
|
||||
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.node_list().await?)
|
||||
}
|
||||
|
||||
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
|
||||
}
|
||||
|
||||
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
|
||||
if node_id != config_req.node_id {
|
||||
@@ -296,13 +390,18 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
}
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.node_configure(config_req).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_split(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
|
||||
|
||||
@@ -316,6 +415,8 @@ async fn handle_tenant_shard_migrate(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
@@ -328,11 +429,35 @@ async fn handle_tenant_shard_migrate(
|
||||
|
||||
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
|
||||
}
|
||||
|
||||
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
state.service.tenants_dump()
|
||||
}
|
||||
|
||||
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
state.service.scheduler_dump()
|
||||
}
|
||||
|
||||
async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.consistency_check().await?)
|
||||
}
|
||||
|
||||
/// Status endpoint is just used for checking that our HTTP listener is up
|
||||
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -384,6 +509,12 @@ where
|
||||
.await
|
||||
}
|
||||
|
||||
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
|
||||
check_permission_with(request, |claims| {
|
||||
crate::auth::check_permission(claims, required_scope)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn make_router(
|
||||
service: Arc<Service>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
@@ -421,6 +552,13 @@ pub fn make_router(
|
||||
.post("/debug/v1/node/:node_id/drop", |r| {
|
||||
request_span(r, handle_node_drop)
|
||||
})
|
||||
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
|
||||
.get("/debug/v1/scheduler", |r| {
|
||||
request_span(r, handle_scheduler_dump)
|
||||
})
|
||||
.post("/debug/v1/consistency_check", |r| {
|
||||
request_span(r, handle_consistency_check)
|
||||
})
|
||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
@@ -448,9 +586,21 @@ pub fn make_router(
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_delete)
|
||||
})
|
||||
.put("/v1/tenant/config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_config_set)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_config_get)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/location_config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_location_config)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
||||
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
|
||||
tenant_service_handler(r, handle_tenant_secondary_download)
|
||||
})
|
||||
// Timeline operations
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_delete)
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::seqwait::MonotonicCounter;
|
||||
|
||||
mod auth;
|
||||
mod compute_hook;
|
||||
pub mod http;
|
||||
pub mod metrics;
|
||||
mod node;
|
||||
pub mod persistence;
|
||||
mod reconciler;
|
||||
@@ -11,18 +13,24 @@ mod schema;
|
||||
pub mod service;
|
||||
mod tenant_state;
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||
enum PlacementPolicy {
|
||||
/// Cheapest way to attach a tenant: just one pageserver, no secondary
|
||||
Single,
|
||||
/// Production-ready way to attach a tenant: one attached pageserver and
|
||||
/// some number of secondaries.
|
||||
Double(usize),
|
||||
/// Do not attach to any pageservers
|
||||
/// Create one secondary mode locations. This is useful when onboarding
|
||||
/// a tenant, or for an idle tenant that we might want to bring online quickly.
|
||||
Secondary,
|
||||
|
||||
/// Do not attach to any pageservers. This is appropriate for tenants that
|
||||
/// have been idle for a long time, where we do not mind some delay in making
|
||||
/// them available in future.
|
||||
Detached,
|
||||
}
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
|
||||
struct Sequence(u64);
|
||||
|
||||
impl Sequence {
|
||||
|
||||
@@ -6,9 +6,10 @@
|
||||
///
|
||||
use anyhow::{anyhow, Context};
|
||||
use attachment_service::http::make_router;
|
||||
use attachment_service::metrics::preinitialize_metrics;
|
||||
use attachment_service::persistence::Persistence;
|
||||
use attachment_service::service::{Config, Service};
|
||||
use aws_config::{self, BehaviorVersion, Region};
|
||||
use aws_config::{BehaviorVersion, Region};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use diesel::Connection;
|
||||
@@ -78,13 +79,38 @@ impl Secrets {
|
||||
"neon-storage-controller-control-plane-jwt-token";
|
||||
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
|
||||
|
||||
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
|
||||
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
|
||||
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
|
||||
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
|
||||
|
||||
/// Load secrets from, in order of preference:
|
||||
/// - CLI args if database URL is provided on the CLI
|
||||
/// - Environment variables if DATABASE_URL is set.
|
||||
/// - AWS Secrets Manager secrets
|
||||
async fn load(args: &Cli) -> anyhow::Result<Self> {
|
||||
match &args.database_url {
|
||||
Some(url) => Self::load_cli(url, args),
|
||||
None => Self::load_aws_sm().await,
|
||||
None => match std::env::var(Self::DATABASE_URL_ENV) {
|
||||
Ok(database_url) => Self::load_env(database_url),
|
||||
Err(_) => Self::load_aws_sm().await,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn load_env(database_url: String) -> anyhow::Result<Self> {
|
||||
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
|
||||
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
|
||||
Err(_) => None,
|
||||
};
|
||||
Ok(Self {
|
||||
database_url,
|
||||
public_key,
|
||||
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
|
||||
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn load_aws_sm() -> anyhow::Result<Self> {
|
||||
let Ok(region) = std::env::var("AWS_REGION") else {
|
||||
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
|
||||
@@ -205,6 +231,8 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
preinitialize_metrics();
|
||||
|
||||
let args = Cli::parse();
|
||||
tracing::info!(
|
||||
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
|
||||
|
||||
32
control_plane/attachment_service/src/metrics.rs
Normal file
32
control_plane/attachment_service/src/metrics.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(crate) struct ReconcilerMetrics {
|
||||
pub(crate) spawned: IntCounter,
|
||||
pub(crate) complete: IntCounterVec,
|
||||
}
|
||||
|
||||
impl ReconcilerMetrics {
|
||||
// Labels used on [`Self::complete`]
|
||||
pub(crate) const SUCCESS: &'static str = "ok";
|
||||
pub(crate) const ERROR: &'static str = "success";
|
||||
pub(crate) const CANCEL: &'static str = "cancel";
|
||||
}
|
||||
|
||||
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
|
||||
spawned: register_int_counter!(
|
||||
"storage_controller_reconcile_spawn",
|
||||
"Count of how many times we spawn a reconcile task",
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
complete: register_int_counter_vec!(
|
||||
"storage_controller_reconcile_complete",
|
||||
"Reconciler tasks completed, broken down by success/failure/cancelled",
|
||||
&["status"],
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
Lazy::force(&RECONCILER);
|
||||
}
|
||||
@@ -1,9 +1,16 @@
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use serde::Serialize;
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::persistence::NodePersistence;
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Represents the in-memory description of a Node.
|
||||
///
|
||||
/// Scheduling statistics are maintened separately in [`crate::scheduler`].
|
||||
///
|
||||
/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
|
||||
/// implementation of serialization on this type is only for debug dumps.
|
||||
#[derive(Clone, Serialize)]
|
||||
pub(crate) struct Node {
|
||||
pub(crate) id: NodeId,
|
||||
|
||||
|
||||
@@ -6,10 +6,12 @@ use std::time::Duration;
|
||||
use self::split_state::SplitState;
|
||||
use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use diesel::{
|
||||
Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
|
||||
Selectable, SelectableHelper,
|
||||
};
|
||||
use pageserver_api::controller_api::NodeSchedulingPolicy;
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -130,24 +132,10 @@ impl Persistence {
|
||||
}
|
||||
|
||||
/// At startup, populate the list of nodes which our shards may be placed on
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
|
||||
let nodes: Vec<Node> = self
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
|
||||
let nodes: Vec<NodePersistence> = self
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::nodes::table
|
||||
.load::<NodePersistence>(conn)?
|
||||
.into_iter()
|
||||
.map(|n| Node {
|
||||
id: NodeId(n.node_id as u64),
|
||||
// At startup we consider a node offline until proven otherwise.
|
||||
availability: NodeAvailability::Offline,
|
||||
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
|
||||
.expect("Bad scheduling policy in DB"),
|
||||
listen_http_addr: n.listen_http_addr,
|
||||
listen_http_port: n.listen_http_port as u16,
|
||||
listen_pg_addr: n.listen_pg_addr,
|
||||
listen_pg_port: n.listen_pg_port as u16,
|
||||
})
|
||||
.collect::<Vec<Node>>())
|
||||
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
||||
})
|
||||
.await?;
|
||||
|
||||
@@ -156,6 +144,31 @@ impl Persistence {
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
pub(crate) async fn update_node(
|
||||
&self,
|
||||
input_node_id: NodeId,
|
||||
input_scheduling: NodeSchedulingPolicy,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::nodes::dsl::*;
|
||||
let updated = self
|
||||
.with_conn(move |conn| {
|
||||
let updated = diesel::update(nodes)
|
||||
.filter(node_id.eq(input_node_id.0 as i64))
|
||||
.set((scheduling_policy.eq(String::from(input_scheduling)),))
|
||||
.execute(conn)?;
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
if updated != 1 {
|
||||
Err(DatabaseError::Logical(format!(
|
||||
"Node {node_id:?} not found for update",
|
||||
)))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// At startup, load the high level state for shards, such as their config + policy. This will
|
||||
/// be enriched at runtime with state discovered on pageservers.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
@@ -320,7 +333,15 @@ impl Persistence {
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount::new(tsp.shard_count as u8),
|
||||
};
|
||||
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
|
||||
|
||||
let Some(g) = tsp.generation else {
|
||||
// If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
|
||||
// we only set generation_pageserver when setting generation.
|
||||
return Err(DatabaseError::Logical(
|
||||
"Generation should always be set after incrementing".to_string(),
|
||||
));
|
||||
};
|
||||
result.insert(tenant_shard_id, Generation::new(g as u32));
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
@@ -353,7 +374,85 @@ impl Persistence {
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(Generation::new(updated.generation as u32))
|
||||
// Generation is always non-null in the rseult: if the generation column had been NULL, then we
|
||||
// should have experienced an SQL Confilict error while executing a query that tries to increment it.
|
||||
debug_assert!(updated.generation.is_some());
|
||||
let Some(g) = updated.generation else {
|
||||
return Err(DatabaseError::Logical(
|
||||
"Generation should always be set after incrementing".to_string(),
|
||||
)
|
||||
.into());
|
||||
};
|
||||
|
||||
Ok(Generation::new(g as u32))
|
||||
}
|
||||
|
||||
/// For use when updating a persistent property of a tenant, such as its config or placement_policy.
|
||||
///
|
||||
/// Do not use this for settting generation, unless in the special onboarding code path (/location_config)
|
||||
/// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing
|
||||
/// that we only do the first time a tenant is set to an attached policy via /location_config.
|
||||
pub(crate) async fn update_tenant_shard(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
input_placement_policy: PlacementPolicy,
|
||||
input_config: TenantConfig,
|
||||
input_generation: Option<Generation>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_conn(move |conn| {
|
||||
let query = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
|
||||
|
||||
if let Some(input_generation) = input_generation {
|
||||
// Update includes generation column
|
||||
query
|
||||
.set((
|
||||
generation.eq(Some(input_generation.into().unwrap() as i32)),
|
||||
placement_policy
|
||||
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
|
||||
config.eq(serde_json::to_string(&input_config).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
} else {
|
||||
// Update does not include generation column
|
||||
query
|
||||
.set((
|
||||
placement_policy
|
||||
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
|
||||
config.eq(serde_json::to_string(&input_config).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn update_tenant_config(
|
||||
&self,
|
||||
input_tenant_id: TenantId,
|
||||
input_config: TenantConfig,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_conn(move |conn| {
|
||||
diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
||||
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
@@ -364,7 +463,7 @@ impl Persistence {
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.set((
|
||||
generation_pageserver.eq(i64::MAX),
|
||||
generation_pageserver.eq(Option::<i64>::None),
|
||||
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
@@ -477,7 +576,7 @@ impl Persistence {
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
pub(crate) struct TenantShardPersistence {
|
||||
#[serde(default)]
|
||||
@@ -490,12 +589,15 @@ pub(crate) struct TenantShardPersistence {
|
||||
pub(crate) shard_stripe_size: i32,
|
||||
|
||||
// Latest generation number: next time we attach, increment this
|
||||
// and use the incremented number when attaching
|
||||
pub(crate) generation: i32,
|
||||
// and use the incremented number when attaching.
|
||||
//
|
||||
// Generation is only None when first onboarding a tenant, where it may
|
||||
// be in PlacementPolicy::Secondary and therefore have no valid generation state.
|
||||
pub(crate) generation: Option<i32>,
|
||||
|
||||
// Currently attached pageserver
|
||||
#[serde(rename = "pageserver")]
|
||||
pub(crate) generation_pageserver: i64,
|
||||
pub(crate) generation_pageserver: Option<i64>,
|
||||
|
||||
#[serde(default)]
|
||||
pub(crate) placement_policy: String,
|
||||
@@ -506,7 +608,7 @@ pub(crate) struct TenantShardPersistence {
|
||||
}
|
||||
|
||||
/// Parts of [`crate::node::Node`] that are stored durably
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
|
||||
#[diesel(table_name = crate::schema::nodes)]
|
||||
pub(crate) struct NodePersistence {
|
||||
pub(crate) node_id: i64,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::persistence::Persistence;
|
||||
use crate::service;
|
||||
use control_plane::attachment_service::NodeAvailability;
|
||||
use pageserver_api::controller_api::NodeAvailability;
|
||||
use pageserver_api::models::{
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||
};
|
||||
@@ -26,8 +26,8 @@ pub(super) struct Reconciler {
|
||||
/// of a tenant's state from when we spawned a reconcile task.
|
||||
pub(super) tenant_shard_id: TenantShardId,
|
||||
pub(crate) shard: ShardIdentity,
|
||||
pub(crate) generation: Generation,
|
||||
pub(crate) intent: IntentState,
|
||||
pub(crate) generation: Option<Generation>,
|
||||
pub(crate) intent: TargetState,
|
||||
pub(crate) config: TenantConfig,
|
||||
pub(crate) observed: ObservedState,
|
||||
|
||||
@@ -62,10 +62,38 @@ pub(super) struct Reconciler {
|
||||
pub(crate) persistence: Arc<Persistence>,
|
||||
}
|
||||
|
||||
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
|
||||
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
|
||||
/// and the TargetState is just the instruction for a particular Reconciler run.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct TargetState {
|
||||
pub(crate) attached: Option<NodeId>,
|
||||
pub(crate) secondary: Vec<NodeId>,
|
||||
}
|
||||
|
||||
impl TargetState {
|
||||
pub(crate) fn from_intent(intent: &IntentState) -> Self {
|
||||
Self {
|
||||
attached: *intent.get_attached(),
|
||||
secondary: intent.get_secondary().clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = self.secondary.clone();
|
||||
if let Some(node_id) = &self.attached {
|
||||
result.push(*node_id);
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum ReconcileError {
|
||||
#[error(transparent)]
|
||||
Notify(#[from] NotifyError),
|
||||
#[error("Cancelled")]
|
||||
Cancel,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
@@ -284,7 +312,7 @@ impl Reconciler {
|
||||
&self.shard,
|
||||
&self.config,
|
||||
LocationConfigMode::AttachedStale,
|
||||
Some(self.generation),
|
||||
self.generation,
|
||||
None,
|
||||
);
|
||||
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
|
||||
@@ -307,16 +335,17 @@ impl Reconciler {
|
||||
}
|
||||
|
||||
// Increment generation before attaching to new pageserver
|
||||
self.generation = self
|
||||
.persistence
|
||||
.increment_generation(self.tenant_shard_id, dest_ps_id)
|
||||
.await?;
|
||||
self.generation = Some(
|
||||
self.persistence
|
||||
.increment_generation(self.tenant_shard_id, dest_ps_id)
|
||||
.await?,
|
||||
);
|
||||
|
||||
let dest_conf = build_location_config(
|
||||
&self.shard,
|
||||
&self.config,
|
||||
LocationConfigMode::AttachedMulti,
|
||||
Some(self.generation),
|
||||
self.generation,
|
||||
None,
|
||||
);
|
||||
|
||||
@@ -373,7 +402,7 @@ impl Reconciler {
|
||||
&self.shard,
|
||||
&self.config,
|
||||
LocationConfigMode::AttachedSingle,
|
||||
Some(self.generation),
|
||||
self.generation,
|
||||
None,
|
||||
);
|
||||
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
|
||||
@@ -405,23 +434,63 @@ impl Reconciler {
|
||||
|
||||
// If the attached pageserver is not attached, do so now.
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
let mut wanted_conf =
|
||||
attached_location_conf(self.generation, &self.shard, &self.config);
|
||||
// If we are in an attached policy, then generation must have been set (null generations
|
||||
// are only present when a tenant is initially loaded with a secondary policy)
|
||||
debug_assert!(self.generation.is_some());
|
||||
let Some(generation) = self.generation else {
|
||||
return Err(ReconcileError::Other(anyhow::anyhow!(
|
||||
"Attempted to attach with NULL generation"
|
||||
)));
|
||||
};
|
||||
|
||||
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||
// Nothing to do
|
||||
tracing::info!("Observed configuration already correct.")
|
||||
tracing::info!(%node_id, "Observed configuration already correct.")
|
||||
}
|
||||
_ => {
|
||||
observed => {
|
||||
// In all cases other than a matching observed configuration, we will
|
||||
// reconcile this location. This includes locations with different configurations, as well
|
||||
// as locations with unknown (None) observed state.
|
||||
self.generation = self
|
||||
.persistence
|
||||
.increment_generation(self.tenant_shard_id, node_id)
|
||||
.await?;
|
||||
wanted_conf.generation = self.generation.into();
|
||||
tracing::info!("Observed configuration requires update.");
|
||||
|
||||
// The general case is to increment the generation. However, there are cases
|
||||
// where this is not necessary:
|
||||
// - if we are only updating the TenantConf part of the location
|
||||
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
|
||||
// and the location was already in the correct generation
|
||||
let increment_generation = match observed {
|
||||
None => true,
|
||||
Some(ObservedStateLocation { conf: None }) => true,
|
||||
Some(ObservedStateLocation {
|
||||
conf: Some(observed),
|
||||
}) => {
|
||||
let generations_match = observed.generation == wanted_conf.generation;
|
||||
|
||||
use LocationConfigMode::*;
|
||||
let mode_transition_requires_gen_inc =
|
||||
match (observed.mode, wanted_conf.mode) {
|
||||
// Usually the short-lived attachment modes (multi and stale) are only used
|
||||
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
|
||||
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
|
||||
(AttachedSingle, AttachedStale) => false,
|
||||
(AttachedMulti, AttachedSingle) => false,
|
||||
(lhs, rhs) => lhs != rhs,
|
||||
};
|
||||
|
||||
!generations_match || mode_transition_requires_gen_inc
|
||||
}
|
||||
};
|
||||
|
||||
if increment_generation {
|
||||
let generation = self
|
||||
.persistence
|
||||
.increment_generation(self.tenant_shard_id, node_id)
|
||||
.await?;
|
||||
self.generation = Some(generation);
|
||||
wanted_conf.generation = generation.into();
|
||||
}
|
||||
tracing::info!(%node_id, "Observed configuration requires update.");
|
||||
self.location_config(node_id, wanted_conf, None).await?;
|
||||
self.compute_notify().await?;
|
||||
}
|
||||
@@ -471,6 +540,9 @@ impl Reconciler {
|
||||
}
|
||||
|
||||
for (node_id, conf) in changes {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(ReconcileError::Cancel);
|
||||
}
|
||||
self.location_config(node_id, conf, None).await?;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use utils::{http::error::ApiError, id::NodeId};
|
||||
|
||||
use crate::{node::Node, tenant_state::TenantState};
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use utils::{http::error::ApiError, id::NodeId};
|
||||
|
||||
/// Scenarios in which we cannot find a suitable location for a tenant shard
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
@@ -19,52 +18,203 @@ impl From<ScheduleError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Eq, PartialEq)]
|
||||
struct SchedulerNode {
|
||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
|
||||
shard_count: usize,
|
||||
|
||||
/// Whether this node is currently elegible to have new shards scheduled (this is derived
|
||||
/// from a node's availability state and scheduling policy).
|
||||
may_schedule: bool,
|
||||
}
|
||||
|
||||
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
|
||||
/// on which to run.
|
||||
///
|
||||
/// The type has no persistent state of its own: this is all populated at startup. The Serialize
|
||||
/// impl is only for debug dumps.
|
||||
#[derive(Serialize)]
|
||||
pub(crate) struct Scheduler {
|
||||
tenant_counts: HashMap<NodeId, usize>,
|
||||
nodes: HashMap<NodeId, SchedulerNode>,
|
||||
}
|
||||
|
||||
impl Scheduler {
|
||||
pub(crate) fn new(
|
||||
tenants: &BTreeMap<TenantShardId, TenantState>,
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
) -> Self {
|
||||
let mut tenant_counts = HashMap::new();
|
||||
for node_id in nodes.keys() {
|
||||
tenant_counts.insert(*node_id, 0);
|
||||
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
|
||||
let mut scheduler_nodes = HashMap::new();
|
||||
for node in nodes {
|
||||
scheduler_nodes.insert(
|
||||
node.id,
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
for tenant in tenants.values() {
|
||||
if let Some(ps) = tenant.intent.attached {
|
||||
let entry = tenant_counts.entry(ps).or_insert(0);
|
||||
*entry += 1;
|
||||
}
|
||||
Self {
|
||||
nodes: scheduler_nodes,
|
||||
}
|
||||
|
||||
for (node_id, node) in nodes {
|
||||
if !node.may_schedule() {
|
||||
tenant_counts.remove(node_id);
|
||||
}
|
||||
}
|
||||
|
||||
Self { tenant_counts }
|
||||
}
|
||||
|
||||
pub(crate) fn schedule_shard(
|
||||
&mut self,
|
||||
hard_exclude: &[NodeId],
|
||||
) -> Result<NodeId, ScheduleError> {
|
||||
if self.tenant_counts.is_empty() {
|
||||
/// For debug/support: check that our internal statistics are in sync with the state of
|
||||
/// the nodes & tenant shards.
|
||||
///
|
||||
/// If anything is inconsistent, log details and return an error.
|
||||
pub(crate) fn consistency_check<'a>(
|
||||
&self,
|
||||
nodes: impl Iterator<Item = &'a Node>,
|
||||
shards: impl Iterator<Item = &'a TenantState>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
|
||||
for node in nodes {
|
||||
expect_nodes.insert(
|
||||
node.id,
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
for shard in shards {
|
||||
if let Some(node_id) = shard.intent.get_attached() {
|
||||
match expect_nodes.get_mut(node_id) {
|
||||
Some(node) => node.shard_count += 1,
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
shard.tenant_shard_id,
|
||||
node_id
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
for node_id in shard.intent.get_secondary() {
|
||||
match expect_nodes.get_mut(node_id) {
|
||||
Some(node) => node.shard_count += 1,
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
shard.tenant_shard_id,
|
||||
node_id
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (node_id, expect_node) in &expect_nodes {
|
||||
let Some(self_node) = self.nodes.get(node_id) else {
|
||||
anyhow::bail!("Node {node_id} not found in Self")
|
||||
};
|
||||
|
||||
if self_node != expect_node {
|
||||
tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
|
||||
tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
|
||||
tracing::error!("Self state: {}", serde_json::to_string(self_node)?);
|
||||
|
||||
anyhow::bail!("Inconsistent state on {node_id}");
|
||||
}
|
||||
}
|
||||
|
||||
if expect_nodes.len() != self.nodes.len() {
|
||||
// We just checked that all the expected nodes are present. If the lengths don't match,
|
||||
// it means that we have nodes in Self that are unexpected.
|
||||
for node_id in self.nodes.keys() {
|
||||
if !expect_nodes.contains_key(node_id) {
|
||||
anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Increment the reference count of a node. This reference count is used to guide scheduling
|
||||
/// decisions, not for memory management: it represents one tenant shard whose IntentState targets
|
||||
/// this node.
|
||||
///
|
||||
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
|
||||
/// [`Self::new`] or [`Self::node_upsert`])
|
||||
pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
debug_assert!(false);
|
||||
return;
|
||||
};
|
||||
|
||||
node.shard_count += 1;
|
||||
}
|
||||
|
||||
/// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`].
|
||||
pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
debug_assert!(false);
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
return;
|
||||
};
|
||||
|
||||
node.shard_count -= 1;
|
||||
}
|
||||
|
||||
pub(crate) fn node_upsert(&mut self, node: &Node) {
|
||||
use std::collections::hash_map::Entry::*;
|
||||
match self.nodes.entry(node.id) {
|
||||
Occupied(mut entry) => {
|
||||
entry.get_mut().may_schedule = node.may_schedule();
|
||||
}
|
||||
Vacant(entry) => {
|
||||
entry.insert(SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn node_remove(&mut self, node_id: NodeId) {
|
||||
if self.nodes.remove(&node_id).is_none() {
|
||||
tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
|
||||
}
|
||||
}
|
||||
|
||||
/// Where we have several nodes to choose from, for example when picking a secondary location
|
||||
/// to promote to an attached location, this method may be used to pick the best choice based
|
||||
/// on the scheduler's knowledge of utilization and availability.
|
||||
///
|
||||
/// If the input is empty, or all the nodes are not elegible for scheduling, return None: the
|
||||
/// caller can pick a node some other way.
|
||||
pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
|
||||
if nodes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let node = nodes
|
||||
.iter()
|
||||
.map(|node_id| {
|
||||
let may_schedule = self
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.may_schedule)
|
||||
.unwrap_or(false);
|
||||
(*node_id, may_schedule)
|
||||
})
|
||||
.max_by_key(|(_n, may_schedule)| *may_schedule);
|
||||
|
||||
// If even the preferred node has may_schedule==false, return None
|
||||
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
||||
}
|
||||
|
||||
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
|
||||
if self.nodes.is_empty() {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
|
||||
let mut tenant_counts: Vec<(NodeId, usize)> = self
|
||||
.tenant_counts
|
||||
.nodes
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) {
|
||||
if hard_exclude.contains(k) || !v.may_schedule {
|
||||
None
|
||||
} else {
|
||||
Some((*k, *v))
|
||||
Some((*k, v.shard_count))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
@@ -73,7 +223,18 @@ impl Scheduler {
|
||||
tenant_counts.sort_by_key(|i| (i.1, i.0));
|
||||
|
||||
if tenant_counts.is_empty() {
|
||||
// After applying constraints, no pageservers were left
|
||||
// After applying constraints, no pageservers were left. We log some detail about
|
||||
// the state of nodes to help understand why this happened. This is not logged as an error because
|
||||
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
|
||||
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
|
||||
for (node_id, node) in &self.nodes {
|
||||
tracing::info!(
|
||||
"Node {node_id}: may_schedule={} shards={}",
|
||||
node.may_schedule,
|
||||
node.shard_count
|
||||
);
|
||||
}
|
||||
|
||||
return Err(ScheduleError::ImpossibleConstraint);
|
||||
}
|
||||
|
||||
@@ -82,7 +243,88 @@ impl Scheduler {
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
|
||||
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
);
|
||||
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
|
||||
|
||||
// Note that we do not update shard count here to reflect the scheduling: that
|
||||
// is IntentState's job when the scheduled location is used.
|
||||
|
||||
Ok(node_id)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test_utils {
|
||||
|
||||
use crate::node::Node;
|
||||
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use std::collections::HashMap;
|
||||
use utils::id::NodeId;
|
||||
/// Test helper: synthesize the requested number of nodes, all in active state.
|
||||
///
|
||||
/// Node IDs start at one.
|
||||
pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
|
||||
(1..n + 1)
|
||||
.map(|i| {
|
||||
(
|
||||
NodeId(i),
|
||||
Node {
|
||||
id: NodeId(i),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
listen_http_addr: format!("httphost-{i}"),
|
||||
listen_http_port: 80 + i as u16,
|
||||
listen_pg_addr: format!("pghost-{i}"),
|
||||
listen_pg_port: 5432 + i as u16,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use crate::tenant_state::IntentState;
|
||||
#[test]
|
||||
fn scheduler_basic() -> anyhow::Result<()> {
|
||||
let nodes = test_utils::make_test_nodes(2);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut t1_intent = IntentState::new();
|
||||
let mut t2_intent = IntentState::new();
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&[])?;
|
||||
t1_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
let scheduled = scheduler.schedule_shard(&[])?;
|
||||
t2_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
|
||||
t1_intent.push_secondary(&mut scheduler, scheduled);
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
|
||||
|
||||
t1_intent.clear(&mut scheduler);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Dropping an IntentState without clearing it causes a panic in debug mode,
|
||||
// because we have failed to properly update scheduler shard counts.
|
||||
let result = std::panic::catch_unwind(move || {
|
||||
drop(t2_intent);
|
||||
});
|
||||
assert!(result.is_err());
|
||||
} else {
|
||||
t2_intent.clear(&mut scheduler);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,8 +17,8 @@ diesel::table! {
|
||||
shard_number -> Int4,
|
||||
shard_count -> Int4,
|
||||
shard_stripe_size -> Int4,
|
||||
generation -> Int4,
|
||||
generation_pageserver -> Int8,
|
||||
generation -> Nullable<Int4>,
|
||||
generation_pageserver -> Nullable<Int8>,
|
||||
placement_policy -> Varchar,
|
||||
splitting -> Int2,
|
||||
config -> Text,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,10 +1,12 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use control_plane::attachment_service::NodeAvailability;
|
||||
use crate::{metrics, persistence::TenantShardPersistence};
|
||||
use pageserver_api::controller_api::NodeAvailability;
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
};
|
||||
use serde::Serialize;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{instrument, Instrument};
|
||||
@@ -19,11 +21,27 @@ use crate::{
|
||||
compute_hook::ComputeHook,
|
||||
node::Node,
|
||||
persistence::{split_state::SplitState, Persistence},
|
||||
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
|
||||
reconciler::{
|
||||
attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
|
||||
},
|
||||
scheduler::{ScheduleError, Scheduler},
|
||||
service, PlacementPolicy, Sequence,
|
||||
};
|
||||
|
||||
/// Serialization helper
|
||||
fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::ser::Serializer,
|
||||
T: Clone + std::fmt::Display,
|
||||
{
|
||||
serializer.collect_str(&v.lock().unwrap())
|
||||
}
|
||||
|
||||
/// In-memory state for a particular tenant shard.
|
||||
///
|
||||
/// This struct implement Serialize for debugging purposes, but is _not_ persisted
|
||||
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
|
||||
#[derive(Serialize)]
|
||||
pub(crate) struct TenantState {
|
||||
pub(crate) tenant_shard_id: TenantShardId,
|
||||
|
||||
@@ -35,8 +53,11 @@ pub(crate) struct TenantState {
|
||||
pub(crate) sequence: Sequence,
|
||||
|
||||
// Latest generation number: next time we attach, increment this
|
||||
// and use the incremented number when attaching
|
||||
pub(crate) generation: Generation,
|
||||
// and use the incremented number when attaching.
|
||||
//
|
||||
// None represents an incompletely onboarded tenant via the [`Service::location_config`]
|
||||
// API, where this tenant may only run in PlacementPolicy::Secondary.
|
||||
pub(crate) generation: Option<Generation>,
|
||||
|
||||
// High level description of how the tenant should be set up. Provided
|
||||
// externally.
|
||||
@@ -58,6 +79,7 @@ pub(crate) struct TenantState {
|
||||
/// If a reconcile task is currently in flight, it may be joined here (it is
|
||||
/// only safe to join if either the result has been received or the reconciler's
|
||||
/// cancellation token has been fired)
|
||||
#[serde(skip)]
|
||||
pub(crate) reconciler: Option<ReconcilerHandle>,
|
||||
|
||||
/// If a tenant is being split, then all shards with that TenantId will have a
|
||||
@@ -67,16 +89,19 @@ pub(crate) struct TenantState {
|
||||
|
||||
/// Optionally wait for reconciliation to complete up to a particular
|
||||
/// sequence number.
|
||||
#[serde(skip)]
|
||||
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
|
||||
/// Indicates sequence number for which we have encountered an error reconciling. If
|
||||
/// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
|
||||
/// and callers should stop waiting for `waiter` and propagate the error.
|
||||
#[serde(skip)]
|
||||
pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
|
||||
/// The most recent error from a reconcile on this tenant
|
||||
/// TODO: generalize to an array of recent events
|
||||
/// TOOD: use a ArcSwap instead of mutex for faster reads?
|
||||
#[serde(serialize_with = "read_mutex_content")]
|
||||
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
|
||||
|
||||
/// If we have a pending compute notification that for some reason we weren't able to send,
|
||||
@@ -86,13 +111,140 @@ pub(crate) struct TenantState {
|
||||
pub(crate) pending_compute_notification: bool,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug)]
|
||||
#[derive(Default, Clone, Debug, Serialize)]
|
||||
pub(crate) struct IntentState {
|
||||
pub(crate) attached: Option<NodeId>,
|
||||
pub(crate) secondary: Vec<NodeId>,
|
||||
attached: Option<NodeId>,
|
||||
secondary: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
impl IntentState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
attached: None,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
|
||||
if let Some(node_id) = node_id {
|
||||
scheduler.node_inc_ref(node_id);
|
||||
}
|
||||
Self {
|
||||
attached: node_id,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
|
||||
if self.attached != new_attached {
|
||||
if let Some(old_attached) = self.attached.take() {
|
||||
scheduler.node_dec_ref(old_attached);
|
||||
}
|
||||
if let Some(new_attached) = &new_attached {
|
||||
scheduler.node_inc_ref(*new_attached);
|
||||
}
|
||||
self.attached = new_attached;
|
||||
}
|
||||
}
|
||||
|
||||
/// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from
|
||||
/// secondary to attached while maintaining the scheduler's reference counts.
|
||||
pub(crate) fn promote_attached(
|
||||
&mut self,
|
||||
_scheduler: &mut Scheduler,
|
||||
promote_secondary: NodeId,
|
||||
) {
|
||||
// If we call this with a node that isn't in secondary, it would cause incorrect
|
||||
// scheduler reference counting, since we assume the node is already referenced as a secondary.
|
||||
debug_assert!(self.secondary.contains(&promote_secondary));
|
||||
|
||||
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
|
||||
// need to call into it here.
|
||||
self.secondary.retain(|n| n != &promote_secondary);
|
||||
self.attached = Some(promote_secondary);
|
||||
}
|
||||
|
||||
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
|
||||
debug_assert!(!self.secondary.contains(&new_secondary));
|
||||
scheduler.node_inc_ref(new_secondary);
|
||||
self.secondary.push(new_secondary);
|
||||
}
|
||||
|
||||
/// It is legal to call this with a node that is not currently a secondary: that is a no-op
|
||||
pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
|
||||
let index = self.secondary.iter().position(|n| *n == node_id);
|
||||
if let Some(index) = index {
|
||||
scheduler.node_dec_ref(node_id);
|
||||
self.secondary.remove(index);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
|
||||
for secondary in self.secondary.drain(..) {
|
||||
scheduler.node_dec_ref(secondary);
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove the last secondary node from the list of secondaries
|
||||
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
|
||||
if let Some(node_id) = self.secondary.pop() {
|
||||
scheduler.node_dec_ref(node_id);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
|
||||
if let Some(old_attached) = self.attached.take() {
|
||||
scheduler.node_dec_ref(old_attached);
|
||||
}
|
||||
|
||||
self.clear_secondary(scheduler);
|
||||
}
|
||||
|
||||
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = Vec::new();
|
||||
if let Some(p) = self.attached {
|
||||
result.push(p)
|
||||
}
|
||||
|
||||
result.extend(self.secondary.iter().copied());
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub(crate) fn get_attached(&self) -> &Option<NodeId> {
|
||||
&self.attached
|
||||
}
|
||||
|
||||
pub(crate) fn get_secondary(&self) -> &Vec<NodeId> {
|
||||
&self.secondary
|
||||
}
|
||||
|
||||
/// If the node is in use as the attached location, demote it into
|
||||
/// the list of secondary locations. This is used when a node goes offline,
|
||||
/// and we want to use a different node for attachment, but not permanently
|
||||
/// forget the location on the offline node.
|
||||
///
|
||||
/// Returns true if a change was made
|
||||
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
|
||||
if self.attached == Some(node_id) {
|
||||
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
|
||||
// need to call into it here.
|
||||
self.attached = None;
|
||||
self.secondary.push(node_id);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for IntentState {
|
||||
fn drop(&mut self) {
|
||||
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
|
||||
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Serialize)]
|
||||
pub(crate) struct ObservedState {
|
||||
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
|
||||
}
|
||||
@@ -106,7 +258,7 @@ pub(crate) struct ObservedState {
|
||||
/// what it is (e.g. we failed partway through configuring it)
|
||||
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
|
||||
/// and that configuration will still be present unless something external interfered.
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Serialize)]
|
||||
pub(crate) struct ObservedStateLocation {
|
||||
/// If None, it means we do not know the status of this shard's location on this node, but
|
||||
/// we know that we might have some state on this node.
|
||||
@@ -175,53 +327,13 @@ pub(crate) struct ReconcileResult {
|
||||
pub(crate) result: Result<(), ReconcileError>,
|
||||
|
||||
pub(crate) tenant_shard_id: TenantShardId,
|
||||
pub(crate) generation: Generation,
|
||||
pub(crate) generation: Option<Generation>,
|
||||
pub(crate) observed: ObservedState,
|
||||
|
||||
/// Set [`TenantState::pending_compute_notification`] from this flag
|
||||
pub(crate) pending_compute_notification: bool,
|
||||
}
|
||||
|
||||
impl IntentState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
attached: None,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = Vec::new();
|
||||
if let Some(p) = self.attached {
|
||||
result.push(p)
|
||||
}
|
||||
|
||||
result.extend(self.secondary.iter().copied());
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub(crate) fn single(node_id: Option<NodeId>) -> Self {
|
||||
Self {
|
||||
attached: node_id,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// When a node goes offline, we update intents to avoid using it
|
||||
/// as their attached pageserver.
|
||||
///
|
||||
/// Returns true if a change was made
|
||||
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
|
||||
if self.attached == Some(node_id) {
|
||||
self.attached = None;
|
||||
self.secondary.push(node_id);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ObservedState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
@@ -240,7 +352,7 @@ impl TenantState {
|
||||
tenant_shard_id,
|
||||
policy,
|
||||
intent: IntentState::default(),
|
||||
generation: Generation::new(0),
|
||||
generation: Some(Generation::new(0)),
|
||||
shard,
|
||||
observed: ObservedState::default(),
|
||||
config: TenantConfig::default(),
|
||||
@@ -289,6 +401,9 @@ impl TenantState {
|
||||
// All remaining observed locations generate secondary intents. This includes None
|
||||
// observations, as these may well have some local content on disk that is usable (this
|
||||
// is an edge case that might occur if we restarted during a migration or other change)
|
||||
//
|
||||
// We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`]
|
||||
// will take care of promoting one of these secondaries to be attached.
|
||||
self.observed.locations.keys().for_each(|node_id| {
|
||||
if Some(*node_id) != self.intent.attached {
|
||||
self.intent.secondary.push(*node_id);
|
||||
@@ -296,6 +411,33 @@ impl TenantState {
|
||||
});
|
||||
}
|
||||
|
||||
/// Part of [`Self::schedule`] that is used to choose exactly one node to act as the
|
||||
/// attached pageserver for a shard.
|
||||
///
|
||||
/// Returns whether we modified it, and the NodeId selected.
|
||||
fn schedule_attached(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
) -> Result<(bool, NodeId), ScheduleError> {
|
||||
// No work to do if we already have an attached tenant
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
return Ok((false, node_id));
|
||||
}
|
||||
|
||||
if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
|
||||
// Promote a secondary
|
||||
tracing::debug!("Promoted secondary {} to attached", promote_secondary);
|
||||
self.intent.promote_attached(scheduler, promote_secondary);
|
||||
Ok((true, promote_secondary))
|
||||
} else {
|
||||
// Pick a fresh node: either we had no secondaries or none were schedulable
|
||||
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
|
||||
tracing::debug!("Selected {} as attached", node_id);
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
Ok((true, node_id))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
|
||||
// TODO: before scheduling new nodes, check if any existing content in
|
||||
// self.intent refers to pageservers that are offline, and pick other
|
||||
@@ -306,49 +448,78 @@ impl TenantState {
|
||||
|
||||
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
||||
// more work on the same pageservers we're already using.
|
||||
let mut used_pageservers = self.intent.all_pageservers();
|
||||
let mut modified = false;
|
||||
|
||||
// Add/remove nodes to fulfil policy
|
||||
use PlacementPolicy::*;
|
||||
match self.policy {
|
||||
Single => {
|
||||
// Should have exactly one attached, and zero secondaries
|
||||
if self.intent.attached.is_none() {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.attached = Some(node_id);
|
||||
used_pageservers.push(node_id);
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.secondary.clear();
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Double(secondary_count) => {
|
||||
// Should have exactly one attached, and N secondaries
|
||||
if self.intent.attached.is_none() {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.attached = Some(node_id);
|
||||
used_pageservers.push(node_id);
|
||||
let retain_secondaries = if self.intent.attached.is_none()
|
||||
&& scheduler.node_preferred(&self.intent.secondary).is_some()
|
||||
{
|
||||
// If we have no attached, and one of the secondaries is elegible to be promoted, retain
|
||||
// one more secondary than we usually would, as one of them will become attached futher down this function.
|
||||
secondary_count + 1
|
||||
} else {
|
||||
secondary_count
|
||||
};
|
||||
|
||||
while self.intent.secondary.len() > retain_secondaries {
|
||||
// We have no particular preference for one secondary location over another: just
|
||||
// arbitrarily drop from the end
|
||||
self.intent.pop_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
// Should have exactly one attached, and N secondaries
|
||||
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
let mut used_pageservers = vec![attached_node_id];
|
||||
while self.intent.secondary.len() < secondary_count {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.secondary.push(node_id);
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Detached => {
|
||||
// Should have no attached or secondary pageservers
|
||||
if self.intent.attached.is_some() {
|
||||
self.intent.attached = None;
|
||||
Secondary => {
|
||||
if let Some(node_id) = self.intent.get_attached() {
|
||||
// Populate secondary by demoting the attached node
|
||||
self.intent.demote_attached(*node_id);
|
||||
modified = true;
|
||||
} else if self.intent.secondary.is_empty() {
|
||||
// Populate secondary by scheduling a fresh node
|
||||
let node_id = scheduler.schedule_shard(&[])?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.secondary.clear();
|
||||
while self.intent.secondary.len() > 1 {
|
||||
// We have no particular preference for one secondary location over another: just
|
||||
// arbitrarily drop from the end
|
||||
self.intent.pop_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Detached => {
|
||||
// Never add locations in this mode
|
||||
if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
|
||||
self.intent.clear(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
@@ -395,7 +566,12 @@ impl TenantState {
|
||||
|
||||
fn dirty(&self) -> bool {
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
|
||||
// Maybe panic: it is a severe bug if we try to attach while generation is null.
|
||||
let generation = self
|
||||
.generation
|
||||
.expect("Attempted to enter attached state without a generation");
|
||||
|
||||
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
|
||||
Some(_) | None => {
|
||||
@@ -414,6 +590,13 @@ impl TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
for node_id in self.observed.locations.keys() {
|
||||
if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) {
|
||||
// We have observed state that isn't part of our intent: need to clean it up.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
|
||||
// wake up a reconciler to send it.
|
||||
if self.pending_compute_notification {
|
||||
@@ -466,6 +649,10 @@ impl TenantState {
|
||||
// Reconcile already in flight for the current sequence?
|
||||
if let Some(handle) = &self.reconciler {
|
||||
if handle.sequence == self.sequence {
|
||||
tracing::info!(
|
||||
"Reconciliation already in progress for sequence {:?}",
|
||||
self.sequence,
|
||||
);
|
||||
return Some(ReconcilerWaiter {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
seq_wait: self.waiter.clone(),
|
||||
@@ -485,12 +672,16 @@ impl TenantState {
|
||||
return None;
|
||||
};
|
||||
|
||||
// Advance the sequence before spawning a reconciler, so that sequence waiters
|
||||
// can distinguish between before+after the reconcile completes.
|
||||
self.sequence = self.sequence.next();
|
||||
|
||||
let reconciler_cancel = cancel.child_token();
|
||||
let mut reconciler = Reconciler {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
shard: self.shard,
|
||||
generation: self.generation,
|
||||
intent: self.intent.clone(),
|
||||
intent: TargetState::from_intent(&self.intent),
|
||||
config: self.config.clone(),
|
||||
observed: self.observed.clone(),
|
||||
pageservers: pageservers.clone(),
|
||||
@@ -509,6 +700,7 @@ impl TenantState {
|
||||
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||
metrics::RECONCILER.spawned.inc();
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
@@ -525,6 +717,10 @@ impl TenantState {
|
||||
// TODO: wrap all remote API operations in cancellation check
|
||||
// as well.
|
||||
if reconciler.cancel.is_cancelled() {
|
||||
metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
|
||||
.inc();
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -538,6 +734,20 @@ impl TenantState {
|
||||
reconciler.compute_notify().await.ok();
|
||||
}
|
||||
|
||||
// Update result counter
|
||||
match &result {
|
||||
Ok(_) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
|
||||
Err(ReconcileError::Cancel) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
|
||||
Err(_) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
|
||||
}
|
||||
.inc();
|
||||
|
||||
result_tx
|
||||
.send(ReconcileResult {
|
||||
sequence: reconcile_seq,
|
||||
@@ -567,6 +777,17 @@ impl TenantState {
|
||||
})
|
||||
}
|
||||
|
||||
/// Called when a ReconcileResult has been emitted and the service is updating
|
||||
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
|
||||
/// the handle to indicate there is no longer a reconciliation in progress.
|
||||
pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
|
||||
if let Some(reconcile_handle) = &self.reconciler {
|
||||
if reconcile_handle.sequence <= sequence {
|
||||
self.reconciler = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we had any state at all referring to this node ID, drop it. Does not
|
||||
// attempt to reschedule.
|
||||
pub(crate) fn deref_node(&mut self, node_id: NodeId) {
|
||||
@@ -580,4 +801,100 @@ impl TenantState {
|
||||
|
||||
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
|
||||
}
|
||||
|
||||
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
|
||||
TenantShardPersistence {
|
||||
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
|
||||
shard_number: self.tenant_shard_id.shard_number.0 as i32,
|
||||
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
|
||||
shard_stripe_size: self.shard.stripe_size.0 as i32,
|
||||
generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
|
||||
generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
|
||||
placement_policy: serde_json::to_string(&self.policy).unwrap(),
|
||||
config: serde_json::to_string(&self.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::scheduler::test_utils::make_test_nodes;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
|
||||
let tenant_id = TenantId::generate();
|
||||
let shard_number = ShardNumber(0);
|
||||
let shard_count = ShardCount::new(1);
|
||||
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
};
|
||||
TenantState::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
policy,
|
||||
)
|
||||
}
|
||||
|
||||
/// Test the scheduling behaviors used when a tenant configured for HA is subject
|
||||
/// to nodes being marked offline.
|
||||
#[test]
|
||||
fn tenant_ha_scheduling() -> anyhow::Result<()> {
|
||||
// Start with three nodes. Our tenant will only use two. The third one is
|
||||
// expected to remain unused.
|
||||
let mut nodes = make_test_nodes(3);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
||||
tenant_state
|
||||
.schedule(&mut scheduler)
|
||||
.expect("we have enough nodes, scheduling should work");
|
||||
|
||||
// Expect to initially be schedule on to different nodes
|
||||
assert_eq!(tenant_state.intent.secondary.len(), 1);
|
||||
assert!(tenant_state.intent.attached.is_some());
|
||||
|
||||
let attached_node_id = tenant_state.intent.attached.unwrap();
|
||||
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
|
||||
assert_ne!(attached_node_id, secondary_node_id);
|
||||
|
||||
// Notifying the attached node is offline should demote it to a secondary
|
||||
let changed = tenant_state.intent.demote_attached(attached_node_id);
|
||||
assert!(changed);
|
||||
assert!(tenant_state.intent.attached.is_none());
|
||||
assert_eq!(tenant_state.intent.secondary.len(), 2);
|
||||
|
||||
// Update the scheduler state to indicate the node is offline
|
||||
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
|
||||
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
|
||||
|
||||
// Scheduling the node should promote the still-available secondary node to attached
|
||||
tenant_state
|
||||
.schedule(&mut scheduler)
|
||||
.expect("active nodes are available");
|
||||
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
|
||||
|
||||
// The original attached node should have been retained as a secondary
|
||||
assert_eq!(
|
||||
*tenant_state.intent.secondary.iter().last().unwrap(),
|
||||
attached_node_id
|
||||
);
|
||||
|
||||
tenant_state.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,8 +2,12 @@ use crate::{background_process, local_env::LocalEnv};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hyper::Method;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
},
|
||||
models::{
|
||||
ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
@@ -11,12 +15,12 @@ use pageserver_api::{
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::str::FromStr;
|
||||
use std::{fs, str::FromStr};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
|
||||
@@ -24,7 +28,7 @@ pub struct AttachmentService {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: Utf8PathBuf,
|
||||
jwt_token: Option<String>,
|
||||
private_key: Option<Vec<u8>>,
|
||||
public_key: Option<String>,
|
||||
postgres_port: u16,
|
||||
client: reqwest::Client,
|
||||
@@ -55,126 +59,6 @@ pub struct InspectResponse {
|
||||
pub attachment: Option<(u32, NodeId)>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
pub generation: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponse {
|
||||
pub shards: Vec<TenantCreateResponseShard>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeRegisterRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeConfigureRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub availability: Option<NodeAvailability>,
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantLocateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantLocateResponse {
|
||||
pub shards: Vec<TenantLocateResponseShard>,
|
||||
pub shard_params: ShardParameters,
|
||||
}
|
||||
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy)]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active,
|
||||
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
||||
// secondary locations on this node still exist. Newly added nodes are in this
|
||||
// state until we successfully contact them.
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl FromStr for NodeAvailability {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"offline" => Ok(Self::Offline),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
||||
/// type needs to be defined with diesel traits in there.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
Filling,
|
||||
Pause,
|
||||
Draining,
|
||||
}
|
||||
|
||||
impl FromStr for NodeSchedulingPolicy {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"filling" => Ok(Self::Filling),
|
||||
"pause" => Ok(Self::Pause),
|
||||
"draining" => Ok(Self::Draining),
|
||||
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NodeSchedulingPolicy> for String {
|
||||
fn from(value: NodeSchedulingPolicy) -> String {
|
||||
use NodeSchedulingPolicy::*;
|
||||
match value {
|
||||
Active => "active",
|
||||
Filling => "filling",
|
||||
Pause => "pause",
|
||||
Draining => "draining",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
|
||||
impl AttachmentService {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||
@@ -204,12 +88,11 @@ impl AttachmentService {
|
||||
.pageservers
|
||||
.first()
|
||||
.expect("Config is validated to contain at least one pageserver");
|
||||
let (jwt_token, public_key) = match ps_conf.http_auth_type {
|
||||
let (private_key, public_key) = match ps_conf.http_auth_type {
|
||||
AuthType::Trust => (None, None),
|
||||
AuthType::NeonJWT => {
|
||||
let jwt_token = env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
|
||||
.unwrap();
|
||||
let private_key_path = env.get_private_key_path();
|
||||
let private_key = fs::read(private_key_path).expect("failed to read private key");
|
||||
|
||||
// If pageserver auth is enabled, this implicitly enables auth for this service,
|
||||
// using the same credentials.
|
||||
@@ -235,7 +118,7 @@ impl AttachmentService {
|
||||
} else {
|
||||
std::fs::read_to_string(&public_key_path).expect("Can't read public key")
|
||||
};
|
||||
(Some(jwt_token), Some(public_key))
|
||||
(Some(private_key), Some(public_key))
|
||||
}
|
||||
};
|
||||
|
||||
@@ -243,7 +126,7 @@ impl AttachmentService {
|
||||
env: env.clone(),
|
||||
path,
|
||||
listen,
|
||||
jwt_token,
|
||||
private_key,
|
||||
public_key,
|
||||
postgres_port,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
@@ -317,7 +200,7 @@ impl AttachmentService {
|
||||
"localhost",
|
||||
"-p",
|
||||
&format!("{}", self.postgres_port),
|
||||
&DB_NAME,
|
||||
DB_NAME,
|
||||
])
|
||||
.output()
|
||||
.await
|
||||
@@ -397,7 +280,10 @@ impl AttachmentService {
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
if let Some(private_key) = &self.private_key {
|
||||
let claims = Claims::new(None, Scope::PageServerApi);
|
||||
let jwt_token =
|
||||
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
|
||||
args.push(format!("--jwt-token={jwt_token}"));
|
||||
}
|
||||
|
||||
@@ -422,7 +308,7 @@ impl AttachmentService {
|
||||
)],
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
|| async {
|
||||
match self.status().await {
|
||||
match self.ready().await {
|
||||
Ok(_) => Ok(true),
|
||||
Err(_) => Ok(false),
|
||||
}
|
||||
@@ -468,6 +354,20 @@ impl AttachmentService {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
|
||||
let category = match path.find('/') {
|
||||
Some(idx) => &path[..idx],
|
||||
None => path,
|
||||
};
|
||||
|
||||
match category {
|
||||
"status" | "ready" => Ok(None),
|
||||
"control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))),
|
||||
"v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))),
|
||||
_ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into attachment service
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
@@ -493,11 +393,16 @@ impl AttachmentService {
|
||||
if let Some(body) = body {
|
||||
builder = builder.json(&body)
|
||||
}
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
if let Some(private_key) = &self.private_key {
|
||||
println!("Getting claims for path {}", path);
|
||||
if let Some(required_claims) = Self::get_claims_for_path(&path)? {
|
||||
println!("Got claims {:?} for path {}", required_claims, path);
|
||||
let jwt_token = encode_from_key_file(&required_claims, private_key)?;
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let response = builder.send().await?;
|
||||
@@ -617,8 +522,8 @@ impl AttachmentService {
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn status(&self) -> anyhow::Result<()> {
|
||||
self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
|
||||
pub async fn ready(&self) -> anyhow::Result<()> {
|
||||
self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
|
||||
.await
|
||||
}
|
||||
|
||||
|
||||
@@ -8,14 +8,15 @@
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::attachment_service::{
|
||||
AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
|
||||
};
|
||||
use control_plane::attachment_service::AttachmentService;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::{InitForceMode, LocalEnv};
|
||||
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
@@ -652,6 +653,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
let name = import_match
|
||||
.get_one::<String>("node-name")
|
||||
.ok_or_else(|| anyhow!("No node name provided"))?;
|
||||
let update_catalog = import_match
|
||||
.get_one::<bool>("update-catalog")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
// Parse base inputs
|
||||
let base_tarfile = import_match
|
||||
@@ -694,6 +699,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
None,
|
||||
pg_version,
|
||||
ComputeMode::Primary,
|
||||
!update_catalog,
|
||||
)?;
|
||||
println!("Done");
|
||||
}
|
||||
@@ -831,6 +837,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.get_one::<String>("endpoint_id")
|
||||
.map(String::to_string)
|
||||
.unwrap_or_else(|| format!("ep-{branch_name}"));
|
||||
let update_catalog = sub_args
|
||||
.get_one::<bool>("update-catalog")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
let lsn = sub_args
|
||||
.get_one::<String>("lsn")
|
||||
@@ -880,6 +890,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
http_port,
|
||||
pg_version,
|
||||
mode,
|
||||
!update_catalog,
|
||||
)?;
|
||||
}
|
||||
"start" => {
|
||||
@@ -918,6 +929,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.get(endpoint_id.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
|
||||
|
||||
let create_test_user = sub_args
|
||||
.get_one::<bool>("create-test-user")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
cplane.check_conflicting_endpoints(
|
||||
endpoint.mode,
|
||||
endpoint.tenant_id,
|
||||
@@ -972,6 +988,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
pageservers,
|
||||
remote_ext_config,
|
||||
stripe_size.0 as usize,
|
||||
create_test_user,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -1457,6 +1474,18 @@ fn cli() -> Command {
|
||||
.required(false)
|
||||
.default_value("1");
|
||||
|
||||
let update_catalog = Arg::new("update-catalog")
|
||||
.value_parser(value_parser!(bool))
|
||||
.long("update-catalog")
|
||||
.help("If set, will set up the catalog for neon_superuser")
|
||||
.required(false);
|
||||
|
||||
let create_test_user = Arg::new("create-test-user")
|
||||
.value_parser(value_parser!(bool))
|
||||
.long("create-test-user")
|
||||
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
|
||||
.required(false);
|
||||
|
||||
Command::new("Neon CLI")
|
||||
.arg_required_else_help(true)
|
||||
.version(GIT_VERSION)
|
||||
@@ -1517,6 +1546,7 @@ fn cli() -> Command {
|
||||
.arg(Arg::new("end-lsn").long("end-lsn")
|
||||
.help("Lsn the basebackup ends at"))
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(update_catalog.clone())
|
||||
)
|
||||
).subcommand(
|
||||
Command::new("tenant")
|
||||
@@ -1630,6 +1660,7 @@ fn cli() -> Command {
|
||||
.required(false))
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(hot_standby_arg.clone())
|
||||
.arg(update_catalog)
|
||||
)
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||
@@ -1637,6 +1668,7 @@ fn cli() -> Command {
|
||||
.arg(endpoint_pageserver_id_arg.clone())
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
.arg(create_test_user)
|
||||
)
|
||||
.subcommand(Command::new("reconfigure")
|
||||
.about("Reconfigure the endpoint")
|
||||
|
||||
@@ -41,11 +41,15 @@ use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use compute_api::spec::Database;
|
||||
use compute_api::spec::PgIdent;
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use compute_api::spec::Role;
|
||||
use nix::sys::signal::kill;
|
||||
use nix::sys::signal::Signal;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -122,6 +126,7 @@ impl ComputeControlPlane {
|
||||
http_port: Option<u16>,
|
||||
pg_version: u32,
|
||||
mode: ComputeMode,
|
||||
skip_pg_catalog_updates: bool,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
|
||||
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||
@@ -140,7 +145,7 @@ impl ComputeControlPlane {
|
||||
// before and after start are the same. So, skip catalog updates,
|
||||
// with this we basically test a case of waking up an idle compute, where
|
||||
// we also skip catalog updates in the cloud.
|
||||
skip_pg_catalog_updates: true,
|
||||
skip_pg_catalog_updates,
|
||||
features: vec![],
|
||||
});
|
||||
|
||||
@@ -155,7 +160,7 @@ impl ComputeControlPlane {
|
||||
http_port,
|
||||
pg_port,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates: true,
|
||||
skip_pg_catalog_updates,
|
||||
features: vec![],
|
||||
})?,
|
||||
)?;
|
||||
@@ -500,6 +505,7 @@ impl Endpoint {
|
||||
pageservers: Vec<(Host, u16)>,
|
||||
remote_ext_config: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
) -> Result<()> {
|
||||
if self.status() == EndpointStatus::Running {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
@@ -551,8 +557,26 @@ impl Endpoint {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: vec![],
|
||||
databases: vec![],
|
||||
roles: if create_test_user {
|
||||
vec![Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if create_test_user {
|
||||
vec![Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
settings: None,
|
||||
postgresql_conf: Some(postgresql_conf),
|
||||
},
|
||||
@@ -566,6 +590,7 @@ impl Endpoint {
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
primary_is_running: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -577,11 +602,16 @@ impl Endpoint {
|
||||
.open(self.endpoint_path().join("compute.log"))?;
|
||||
|
||||
// Launch compute_ctl
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
let conn_str = self.connstr("cloud_admin", "postgres");
|
||||
println!("Starting postgres node at '{}'", conn_str);
|
||||
if create_test_user {
|
||||
let conn_str = self.connstr("test", "neondb");
|
||||
println!("Also at '{}'", conn_str);
|
||||
}
|
||||
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
|
||||
cmd.args(["--http-port", &self.http_address.port().to_string()])
|
||||
.args(["--pgdata", self.pgdata().to_str().unwrap()])
|
||||
.args(["--connstr", &self.connstr()])
|
||||
.args(["--connstr", &conn_str])
|
||||
.args([
|
||||
"--spec-path",
|
||||
self.endpoint_path().join("spec.json").to_str().unwrap(),
|
||||
@@ -652,7 +682,9 @@ impl Endpoint {
|
||||
}
|
||||
ComputeStatus::Empty
|
||||
| ComputeStatus::ConfigurationPending
|
||||
| ComputeStatus::Configuration => {
|
||||
| ComputeStatus::Configuration
|
||||
| ComputeStatus::TerminationPending
|
||||
| ComputeStatus::Terminated => {
|
||||
bail!("unexpected compute status: {:?}", state.status)
|
||||
}
|
||||
}
|
||||
@@ -783,13 +815,13 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
pub fn connstr(&self, user: &str, db_name: &str) -> String {
|
||||
format!(
|
||||
"postgresql://{}@{}:{}/{}",
|
||||
"cloud_admin",
|
||||
user,
|
||||
self.pg_address.ip(),
|
||||
self.pg_address.port(),
|
||||
"postgres"
|
||||
db_name
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -412,14 +412,17 @@ impl LocalEnv {
|
||||
|
||||
// this function is used only for testing purposes in CLI e g generate tokens during init
|
||||
pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
|
||||
let private_key_path = if self.private_key_path.is_absolute() {
|
||||
let private_key_path = self.get_private_key_path();
|
||||
let key_data = fs::read(private_key_path)?;
|
||||
encode_from_key_file(claims, &key_data)
|
||||
}
|
||||
|
||||
pub fn get_private_key_path(&self) -> PathBuf {
|
||||
if self.private_key_path.is_absolute() {
|
||||
self.private_key_path.to_path_buf()
|
||||
} else {
|
||||
self.base_data_dir.join(&self.private_key_path)
|
||||
};
|
||||
|
||||
let key_data = fs::read(private_key_path)?;
|
||||
encode_from_key_file(claims, &key_data)
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@@ -17,6 +17,7 @@ use std::time::Duration;
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::controller_api::NodeRegisterRequest;
|
||||
use pageserver_api::models::{
|
||||
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
};
|
||||
@@ -30,7 +31,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
|
||||
use crate::attachment_service::AttachmentService;
|
||||
use crate::local_env::PageServerConf;
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
|
||||
@@ -115,7 +116,7 @@ impl PageServerNode {
|
||||
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
|
||||
let jwt_token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))
|
||||
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
|
||||
.unwrap();
|
||||
overrides.push(format!("control_plane_api_token='{}'", jwt_token));
|
||||
}
|
||||
@@ -210,6 +211,25 @@ impl PageServerNode {
|
||||
update_config: bool,
|
||||
register: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// Register the node with the storage controller before starting pageserver: pageserver must be registered to
|
||||
// successfully call /re-attach and finish starting up.
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
@@ -248,23 +268,6 @@ impl PageServerNode {
|
||||
)
|
||||
.await?;
|
||||
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -350,6 +353,11 @@ impl PageServerNode {
|
||||
.remove("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
compaction_algorithm: settings
|
||||
.remove("compaction_algorithm")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_algorithm' json")?,
|
||||
gc_horizon: settings
|
||||
.remove("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
@@ -389,17 +397,17 @@ impl PageServerNode {
|
||||
evictions_low_residence_duration_metric_threshold: settings
|
||||
.remove("evictions_low_residence_duration_metric_threshold")
|
||||
.map(|x| x.to_string()),
|
||||
gc_feedback: settings
|
||||
.remove("gc_feedback")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
|
||||
lazy_slru_download: settings
|
||||
.remove("lazy_slru_download")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
timeline_get_throttle: settings
|
||||
.remove("timeline_get_throttle")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -453,6 +461,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_threshold' as an integer")?,
|
||||
compaction_algorithm: settings
|
||||
.remove("compactin_algorithm")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_algorithm' json")?,
|
||||
gc_horizon: settings
|
||||
.remove("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
@@ -494,17 +507,17 @@ impl PageServerNode {
|
||||
evictions_low_residence_duration_metric_threshold: settings
|
||||
.remove("evictions_low_residence_duration_metric_threshold")
|
||||
.map(|x| x.to_string()),
|
||||
gc_feedback: settings
|
||||
.remove("gc_feedback")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_feedback' as bool")?,
|
||||
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
|
||||
lazy_slru_download: settings
|
||||
.remove("lazy_slru_download")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
timeline_get_throttle: settings
|
||||
.remove("timeline_get_throttle")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -70,6 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
|
||||
Should only be used e.g. for status check.
|
||||
Currently also used for connection from any pageserver to any safekeeper.
|
||||
|
||||
"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
|
||||
|
||||
"admin": Provides access to the control plane and admin APIs of the attachment service.
|
||||
|
||||
### CLI
|
||||
CLI generates a key pair during call to `neon_local init` with the following commands:
|
||||
|
||||
@@ -52,6 +52,10 @@ pub enum ComputeStatus {
|
||||
// compute will exit soon or is waiting for
|
||||
// control-plane to terminate it.
|
||||
Failed,
|
||||
// Termination requested
|
||||
TerminationPending,
|
||||
// Terminated Postgres
|
||||
Terminated,
|
||||
}
|
||||
|
||||
fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
|
||||
|
||||
@@ -79,6 +79,12 @@ pub struct ComputeSpec {
|
||||
// Stripe size for pageserver sharding, in pages
|
||||
#[serde(default)]
|
||||
pub shard_stripe_size: Option<usize>,
|
||||
|
||||
// When we are starting a new replica in hot standby mode,
|
||||
// we need to know if the primary is running.
|
||||
// This is used to determine if replica should wait for
|
||||
// RUNNING_XACTS from primary or not.
|
||||
pub primary_is_running: Option<bool>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -201,6 +201,11 @@ impl<P: Atomic> GenericCounterPairVec<P> {
|
||||
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
|
||||
self.get_metric_with_label_values(vals).unwrap()
|
||||
}
|
||||
|
||||
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
|
||||
res[0] = self.inc.remove_label_values(vals);
|
||||
res[1] = self.dec.remove_label_values(vals);
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: Atomic> GenericCounterPair<P> {
|
||||
@@ -247,6 +252,15 @@ impl<P: Atomic> GenericCounterPair<P> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: Atomic> Clone for GenericCounterPair<P> {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
inc: self.inc.clone(),
|
||||
dec: self.dec.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Guard returned by [`GenericCounterPair::guard`]
|
||||
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
|
||||
|
||||
|
||||
@@ -18,9 +18,11 @@ enum-map.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono.workspace = true
|
||||
itertools.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
129
libs/pageserver_api/src/controller_api.rs
Normal file
129
libs/pageserver_api/src/controller_api.rs
Normal file
@@ -0,0 +1,129 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
/// Request/response types for the storage controller
|
||||
/// API (`/control/v1` prefix). Implemented by the server
|
||||
/// in [`attachment_service::http`]
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{models::ShardParameters, shard::TenantShardId};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
pub generation: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponse {
|
||||
pub shards: Vec<TenantCreateResponseShard>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeRegisterRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeConfigureRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub availability: Option<NodeAvailability>,
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantLocateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantLocateResponse {
|
||||
pub shards: Vec<TenantLocateResponseShard>,
|
||||
pub shard_params: ShardParameters,
|
||||
}
|
||||
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active,
|
||||
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
||||
// secondary locations on this node still exist. Newly added nodes are in this
|
||||
// state until we successfully contact them.
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl FromStr for NodeAvailability {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"offline" => Ok(Self::Offline),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
||||
/// type needs to be defined with diesel traits in there.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
Filling,
|
||||
Pause,
|
||||
Draining,
|
||||
}
|
||||
|
||||
impl FromStr for NodeSchedulingPolicy {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
"filling" => Ok(Self::Filling),
|
||||
"pause" => Ok(Self::Pause),
|
||||
"draining" => Ok(Self::Draining),
|
||||
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NodeSchedulingPolicy> for String {
|
||||
fn from(value: NodeSchedulingPolicy) -> String {
|
||||
use NodeSchedulingPolicy::*;
|
||||
match value {
|
||||
Active => "active",
|
||||
Filling => "filling",
|
||||
Pause => "pause",
|
||||
Draining => "draining",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
@@ -2,6 +2,7 @@ use postgres_ffi::BLCKSZ;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::key::Key;
|
||||
use itertools::Itertools;
|
||||
|
||||
///
|
||||
/// Represents a set of Keys, in a compact form.
|
||||
@@ -63,9 +64,36 @@ impl KeySpace {
|
||||
KeyPartitioning { parts }
|
||||
}
|
||||
|
||||
/// Update the keyspace such that it doesn't contain any range
|
||||
/// that is overlapping with `other`. This can involve splitting or
|
||||
/// removing of existing ranges.
|
||||
/// Merge another keyspace into the current one.
|
||||
/// Note: the keyspaces must not ovelap (enforced via assertions)
|
||||
pub fn merge(&mut self, other: &KeySpace) {
|
||||
let all_ranges = self
|
||||
.ranges
|
||||
.iter()
|
||||
.merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start);
|
||||
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
let mut prev: Option<&Range<Key>> = None;
|
||||
for range in all_ranges {
|
||||
if let Some(prev) = prev {
|
||||
let overlap =
|
||||
std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
|
||||
assert!(
|
||||
!overlap,
|
||||
"Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}",
|
||||
prev, range
|
||||
);
|
||||
}
|
||||
|
||||
accum.add_range(range.clone());
|
||||
prev = Some(range);
|
||||
}
|
||||
|
||||
self.ranges = accum.to_keyspace().ranges;
|
||||
}
|
||||
|
||||
/// Remove all keys in `other` from `self`.
|
||||
/// This can involve splitting or removing of existing ranges.
|
||||
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
|
||||
let (self_start, self_end) = match (self.start(), self.end()) {
|
||||
(Some(start), Some(end)) => (start, end),
|
||||
@@ -220,16 +248,7 @@ impl KeySpaceAccum {
|
||||
}
|
||||
|
||||
pub fn consume_keyspace(&mut self) -> KeySpace {
|
||||
if let Some(accum) = self.accum.take() {
|
||||
self.ranges.push(accum);
|
||||
}
|
||||
|
||||
let mut prev_accum = KeySpaceAccum::new();
|
||||
std::mem::swap(self, &mut prev_accum);
|
||||
|
||||
KeySpace {
|
||||
ranges: prev_accum.ranges,
|
||||
}
|
||||
std::mem::take(self).to_keyspace()
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
@@ -279,8 +298,16 @@ impl KeySpaceRandomAccum {
|
||||
}
|
||||
KeySpace { ranges }
|
||||
}
|
||||
|
||||
pub fn consume_keyspace(&mut self) -> KeySpace {
|
||||
let mut prev_accum = KeySpaceRandomAccum::new();
|
||||
std::mem::swap(self, &mut prev_accum);
|
||||
|
||||
prev_accum.to_keyspace()
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
|
||||
let start = key_range.start;
|
||||
let end = key_range.end;
|
||||
|
||||
@@ -2,13 +2,14 @@
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use const_format::formatcp;
|
||||
|
||||
/// Public API types
|
||||
pub mod control_api;
|
||||
pub mod controller_api;
|
||||
pub mod key;
|
||||
pub mod keyspace;
|
||||
pub mod models;
|
||||
pub mod reltag;
|
||||
pub mod shard;
|
||||
/// Public API types
|
||||
pub mod upcall_api;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
pub mod partitioning;
|
||||
pub mod utilization;
|
||||
|
||||
pub use utilization::PageserverUtilization;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
@@ -11,7 +14,6 @@ use byteorder::{BigEndian, ReadBytesExt};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::serde_as;
|
||||
use strum_macros;
|
||||
use utils::{
|
||||
completion,
|
||||
history_buffer::HistoryBufferWithDropCounter,
|
||||
@@ -180,7 +182,7 @@ pub enum TimelineState {
|
||||
Broken { reason: String, backtrace: String },
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineCreateRequest {
|
||||
pub new_timeline_id: TimelineId,
|
||||
#[serde(default)]
|
||||
@@ -269,6 +271,8 @@ pub struct TenantConfig {
|
||||
pub compaction_target_size: Option<u64>,
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
// defer parsing compaction_algorithm, like eviction_policy
|
||||
pub compaction_algorithm: Option<CompactionAlgorithm>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
@@ -280,9 +284,9 @@ pub struct TenantConfig {
|
||||
pub eviction_policy: Option<EvictionPolicy>,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
pub evictions_low_residence_duration_metric_threshold: Option<String>,
|
||||
pub gc_feedback: Option<bool>,
|
||||
pub heatmap_period: Option<String>,
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -290,6 +294,7 @@ pub struct TenantConfig {
|
||||
pub enum EvictionPolicy {
|
||||
NoEviction,
|
||||
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
|
||||
OnlyImitiate(EvictionPolicyLayerAccessThreshold),
|
||||
}
|
||||
|
||||
impl EvictionPolicy {
|
||||
@@ -297,10 +302,18 @@ impl EvictionPolicy {
|
||||
match self {
|
||||
EvictionPolicy::NoEviction => "NoEviction",
|
||||
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
|
||||
EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum CompactionAlgorithm {
|
||||
Legacy,
|
||||
Tiered,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct EvictionPolicyLayerAccessThreshold {
|
||||
#[serde(with = "humantime_serde")]
|
||||
@@ -309,10 +322,39 @@ pub struct EvictionPolicyLayerAccessThreshold {
|
||||
pub threshold: Duration,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||
pub struct ThrottleConfig {
|
||||
pub task_kinds: Vec<String>, // TaskKind
|
||||
pub initial: usize,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub refill_interval: Duration,
|
||||
pub refill_amount: NonZeroUsize,
|
||||
pub max: usize,
|
||||
pub fair: bool,
|
||||
}
|
||||
|
||||
impl ThrottleConfig {
|
||||
pub fn disabled() -> Self {
|
||||
Self {
|
||||
task_kinds: vec![], // effectively disables the throttle
|
||||
// other values don't matter with emtpy `task_kinds`.
|
||||
initial: 0,
|
||||
refill_interval: Duration::from_millis(1),
|
||||
refill_amount: NonZeroUsize::new(1).unwrap(),
|
||||
max: 1,
|
||||
fair: true,
|
||||
}
|
||||
}
|
||||
/// The requests per second allowed by the given config.
|
||||
pub fn steady_rps(&self) -> f64 {
|
||||
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
|
||||
}
|
||||
}
|
||||
|
||||
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
|
||||
/// lists out all possible states (and the virtual "Detached" state)
|
||||
/// in a flat form rather than using rust-style enums.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum LocationConfigMode {
|
||||
AttachedSingle,
|
||||
AttachedMulti,
|
||||
@@ -376,6 +418,12 @@ pub struct TenantLocationConfigRequest {
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantTimeTravelRequest {
|
||||
pub shard_counts: Vec<ShardCount>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantShardLocation {
|
||||
@@ -1028,7 +1076,6 @@ impl PagestreamBeMessage {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use bytes::Buf;
|
||||
use serde_json::json;
|
||||
|
||||
use super::*;
|
||||
|
||||
70
libs/pageserver_api/src/models/utilization.rs
Normal file
70
libs/pageserver_api/src/models/utilization.rs
Normal file
@@ -0,0 +1,70 @@
|
||||
use std::time::SystemTime;
|
||||
|
||||
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
|
||||
/// the next tenant.
|
||||
///
|
||||
/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
|
||||
///
|
||||
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
|
||||
/// not handle full u64 values properly.
|
||||
#[derive(serde::Serialize, Debug)]
|
||||
pub struct PageserverUtilization {
|
||||
/// Used disk space
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
pub disk_usage_bytes: u64,
|
||||
/// Free disk space
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
pub free_space_bytes: u64,
|
||||
/// Lower is better score for how good candidate for a next tenant would this pageserver be.
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
pub utilization_score: u64,
|
||||
/// When was this snapshot captured, pageserver local time.
|
||||
///
|
||||
/// Use millis to give confidence that the value is regenerated often enough.
|
||||
#[serde(serialize_with = "ser_rfc3339_millis")]
|
||||
pub captured_at: SystemTime,
|
||||
}
|
||||
|
||||
fn ser_rfc3339_millis<S: serde::Serializer>(
|
||||
ts: &SystemTime,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
||||
}
|
||||
|
||||
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
||||
///
|
||||
/// Instead of newtype, use this because a newtype would get require handling deserializing values
|
||||
/// with the highest bit set which is properly parsed by serde formats, but would create a
|
||||
/// conundrum on how to handle and again serialize such values at type level. It will be a few
|
||||
/// years until we can use more than `i64::MAX` bytes on a disk.
|
||||
fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
|
||||
|
||||
let value = (*value).min(MAX_FORMAT_INT64);
|
||||
|
||||
serializer.serialize_u64(value)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn u64_max_is_serialized_as_u63_max() {
|
||||
let doc = PageserverUtilization {
|
||||
disk_usage_bytes: u64::MAX,
|
||||
free_space_bytes: 0,
|
||||
utilization_score: u64::MAX,
|
||||
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
|
||||
};
|
||||
|
||||
let s = serde_json::to_string(&doc).unwrap();
|
||||
|
||||
let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
|
||||
|
||||
assert_eq!(s, expected);
|
||||
}
|
||||
}
|
||||
@@ -6,7 +6,6 @@ use crate::{
|
||||
};
|
||||
use hex::FromHex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror;
|
||||
use utils::id::TenantId;
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
@@ -502,10 +501,12 @@ impl ShardIdentity {
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
// A: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
// A1: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
// A2: because key_is_shard0 also covers relation size keys, which are written
|
||||
// on all shards even though they're only maintained accurately on shard 0.
|
||||
false
|
||||
} else {
|
||||
!self.is_key_local(key)
|
||||
@@ -654,10 +655,7 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use bincode;
|
||||
use utils::{id::TenantId, Hex};
|
||||
use utils::Hex;
|
||||
|
||||
use super::*;
|
||||
|
||||
|
||||
@@ -80,6 +80,9 @@ pub const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
|
||||
pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
|
||||
|
||||
// From standbydefs.h
|
||||
pub const XLOG_RUNNING_XACTS: u8 = 0x10;
|
||||
|
||||
// From srlu.h
|
||||
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
|
||||
pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;
|
||||
|
||||
@@ -119,11 +119,6 @@ pub fn generate_pg_control(
|
||||
// Generate new pg_control needed for bootstrap
|
||||
checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
|
||||
|
||||
//reset some fields we don't want to preserve
|
||||
//TODO Check this.
|
||||
//We may need to determine the value from twophase data.
|
||||
checkpoint.oldestActiveXid = 0;
|
||||
|
||||
//save new values in pg_control
|
||||
pg_control.checkPoint = 0;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
|
||||
@@ -44,6 +44,26 @@ impl DownloadError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for DownloadError {
|
||||
fn from(value: std::io::Error) -> Self {
|
||||
let needs_unwrap = value.kind() == std::io::ErrorKind::Other
|
||||
&& value
|
||||
.get_ref()
|
||||
.and_then(|x| x.downcast_ref::<DownloadError>())
|
||||
.is_some();
|
||||
|
||||
if needs_unwrap {
|
||||
*value
|
||||
.into_inner()
|
||||
.expect("just checked")
|
||||
.downcast::<DownloadError>()
|
||||
.expect("just checked")
|
||||
} else {
|
||||
DownloadError::Other(value.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TimeTravelError {
|
||||
/// Validation or other error happened due to user input.
|
||||
@@ -142,13 +162,12 @@ impl std::fmt::Display for TimeoutOrCancel {
|
||||
impl std::error::Error for TimeoutOrCancel {}
|
||||
|
||||
impl TimeoutOrCancel {
|
||||
pub fn caused(error: &anyhow::Error) -> Option<&Self> {
|
||||
error.root_cause().downcast_ref()
|
||||
}
|
||||
|
||||
/// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
|
||||
pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
|
||||
Self::caused(error).is_some_and(Self::is_cancel)
|
||||
error
|
||||
.root_cause()
|
||||
.downcast_ref::<Self>()
|
||||
.is_some_and(Self::is_cancel)
|
||||
}
|
||||
|
||||
pub fn is_cancel(&self) -> bool {
|
||||
|
||||
@@ -623,9 +623,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
|
||||
use bytes::Bytes;
|
||||
use camino_tempfile::tempdir;
|
||||
use futures_util::Stream;
|
||||
use std::{collections::HashMap, io::Write};
|
||||
|
||||
async fn read_and_check_metadata(
|
||||
|
||||
@@ -1040,7 +1040,7 @@ mod tests {
|
||||
Some("test/prefix/"),
|
||||
Some("/test/prefix/"),
|
||||
];
|
||||
let expected_outputs = vec![
|
||||
let expected_outputs = [
|
||||
vec!["", "some/path", "some/path"],
|
||||
vec!["/", "/some/path", "/some/path"],
|
||||
vec![
|
||||
|
||||
@@ -73,6 +73,8 @@ where
|
||||
if !*this.hit {
|
||||
if let Poll::Ready(e) = this.cancellation.poll(cx) {
|
||||
*this.hit = true;
|
||||
|
||||
// most likely this will be a std::io::Error wrapping a DownloadError
|
||||
let e = Err(std::io::Error::from(e));
|
||||
return Poll::Ready(Some(e));
|
||||
}
|
||||
@@ -130,6 +132,8 @@ mod tests {
|
||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||
"{inner:?}"
|
||||
);
|
||||
let e = DownloadError::from(e);
|
||||
assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
|
||||
|
||||
tokio::select! {
|
||||
_ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
|
||||
@@ -146,7 +150,7 @@ mod tests {
|
||||
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
|
||||
// because the stream uses 120s timeout we are paused, we advance to 120s right away.
|
||||
// because the stream uses 120s timeout and we are paused, we advance to 120s right away.
|
||||
let first = stream.next();
|
||||
|
||||
let e = first.await.expect("there must be some").unwrap_err();
|
||||
@@ -158,6 +162,8 @@ mod tests {
|
||||
.is_some_and(|e| matches!(e, DownloadError::Timeout)),
|
||||
"{inner:?}"
|
||||
);
|
||||
let e = DownloadError::from(e);
|
||||
assert!(matches!(e, DownloadError::Timeout), "{e:?}");
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ hyper = { workspace = true, features = ["full"] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
jsonwebtoken.workspace = true
|
||||
leaky-bucket.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
// For details about authentication see docs/authentication.md
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use serde;
|
||||
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
|
||||
|
||||
use anyhow::Result;
|
||||
@@ -32,6 +31,8 @@ pub enum Scope {
|
||||
// The scope used by pageservers in upcalls to storage controller and cloud control plane
|
||||
#[serde(rename = "generations_api")]
|
||||
GenerationsApi,
|
||||
// Allows access to control plane managment API and some storage controller endpoints.
|
||||
Admin,
|
||||
}
|
||||
|
||||
/// JWT payload. See docs/authentication.md for the format
|
||||
@@ -204,12 +205,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
// "scope": "tenant",
|
||||
// "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
|
||||
// "iss": "neon.controlplane",
|
||||
// "exp": 1709200879,
|
||||
// "iat": 1678442479
|
||||
// }
|
||||
// ```
|
||||
//
|
||||
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
|
||||
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw";
|
||||
|
||||
// Check it can be validated with the public key
|
||||
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
|
||||
|
||||
@@ -4,7 +4,9 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
|
||||
///
|
||||
/// Can be cloned, moved and kept around in futures as "guard objects".
|
||||
#[derive(Clone)]
|
||||
pub struct Completion(TaskTrackerToken);
|
||||
pub struct Completion {
|
||||
_token: TaskTrackerToken,
|
||||
}
|
||||
|
||||
/// Barrier will wait until all clones of [`Completion`] have been dropped.
|
||||
#[derive(Clone)]
|
||||
@@ -49,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) {
|
||||
tracker.close();
|
||||
|
||||
let token = tracker.token();
|
||||
(Completion(token), Barrier(tracker))
|
||||
(Completion { _token: token }, Barrier(tracker))
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
fs::{self, File},
|
||||
io,
|
||||
io::{self, Write},
|
||||
};
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
@@ -161,6 +161,48 @@ pub async fn durable_rename(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`].
|
||||
///
|
||||
/// The file is first written to the specified `tmp_path`, and in a second
|
||||
/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
|
||||
/// and atomic rename guarantee that, if we crash at any point, there will never
|
||||
/// be a partially written file at `final_path` (but maybe at `tmp_path`).
|
||||
///
|
||||
/// Callers are responsible for serializing calls of this function for a given `final_path`.
|
||||
/// If they don't, there may be an error due to conflicting `tmp_path`, or there will
|
||||
/// be no error and the content of `final_path` will be the "winner" caller's `content`.
|
||||
/// I.e., the atomticity guarantees still hold.
|
||||
pub fn overwrite(
|
||||
final_path: &Utf8Path,
|
||||
tmp_path: &Utf8Path,
|
||||
content: &[u8],
|
||||
) -> std::io::Result<()> {
|
||||
let Some(final_path_parent) = final_path.parent() else {
|
||||
return Err(std::io::Error::from_raw_os_error(
|
||||
nix::errno::Errno::EINVAL as i32,
|
||||
));
|
||||
};
|
||||
std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
|
||||
let mut file = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
// Use `create_new` so that, if we race with ourselves or something else,
|
||||
// we bail out instead of causing damage.
|
||||
.create_new(true)
|
||||
.open(tmp_path)?;
|
||||
file.write_all(content)?;
|
||||
file.sync_all()?;
|
||||
drop(file); // don't keep the fd open for longer than we have to
|
||||
|
||||
std::fs::rename(tmp_path, final_path)?;
|
||||
|
||||
let final_parent_dirfd = std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(final_path_parent)?;
|
||||
|
||||
final_parent_dirfd.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ impl Generation {
|
||||
Self::Broken
|
||||
}
|
||||
|
||||
pub fn new(v: u32) -> Self {
|
||||
pub const fn new(v: u32) -> Self {
|
||||
Self::Valid(v)
|
||||
}
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
use tracing::{debug, info, info_span, warn, Instrument};
|
||||
|
||||
use std::future::Future;
|
||||
use std::str::FromStr;
|
||||
@@ -156,6 +156,10 @@ pub struct ChannelWriter {
|
||||
buffer: BytesMut,
|
||||
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
|
||||
written: usize,
|
||||
/// Time spent waiting for the channel to make progress. It is not the same as time to upload a
|
||||
/// buffer because we cannot know anything about that, but this should allow us to understand
|
||||
/// the actual time taken without the time spent `std::thread::park`ed.
|
||||
wait_time: std::time::Duration,
|
||||
}
|
||||
|
||||
impl ChannelWriter {
|
||||
@@ -168,6 +172,7 @@ impl ChannelWriter {
|
||||
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
|
||||
tx,
|
||||
written: 0,
|
||||
wait_time: std::time::Duration::ZERO,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,6 +185,8 @@ impl ChannelWriter {
|
||||
tracing::trace!(n, "flushing");
|
||||
let ready = self.buffer.split().freeze();
|
||||
|
||||
let wait_started_at = std::time::Instant::now();
|
||||
|
||||
// not ideal to call from blocking code to block_on, but we are sure that this
|
||||
// operation does not spawn_blocking other tasks
|
||||
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
|
||||
@@ -192,6 +199,9 @@ impl ChannelWriter {
|
||||
// sending it to the client.
|
||||
Ok(())
|
||||
});
|
||||
|
||||
self.wait_time += wait_started_at.elapsed();
|
||||
|
||||
if res.is_err() {
|
||||
return Err(std::io::ErrorKind::BrokenPipe.into());
|
||||
}
|
||||
@@ -202,6 +212,10 @@ impl ChannelWriter {
|
||||
pub fn flushed_bytes(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
|
||||
pub fn wait_time(&self) -> std::time::Duration {
|
||||
self.wait_time
|
||||
}
|
||||
}
|
||||
|
||||
impl std::io::Write for ChannelWriter {
|
||||
@@ -252,22 +266,52 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
|
||||
|
||||
let span = info_span!("blocking");
|
||||
tokio::task::spawn_blocking(move || {
|
||||
// there are situations where we lose scraped metrics under load, try to gather some clues
|
||||
// since all nodes are queried this, keep the message count low.
|
||||
let spawned_at = std::time::Instant::now();
|
||||
|
||||
let _span = span.entered();
|
||||
|
||||
let metrics = metrics::gather();
|
||||
|
||||
let gathered_at = std::time::Instant::now();
|
||||
|
||||
let res = encoder
|
||||
.encode(&metrics, &mut writer)
|
||||
.and_then(|_| writer.flush().map_err(|e| e.into()));
|
||||
|
||||
// this instant is not when we finally got the full response sent, sending is done by hyper
|
||||
// in another task.
|
||||
let encoded_at = std::time::Instant::now();
|
||||
|
||||
let spawned_in = spawned_at - started_at;
|
||||
let collected_in = gathered_at - spawned_at;
|
||||
// remove the wait time here in case the tcp connection was clogged
|
||||
let encoded_in = encoded_at - gathered_at - writer.wait_time();
|
||||
let total = encoded_at - started_at;
|
||||
|
||||
match res {
|
||||
Ok(()) => {
|
||||
tracing::info!(
|
||||
bytes = writer.flushed_bytes(),
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
total_ms = total.as_millis(),
|
||||
spawning_ms = spawned_in.as_millis(),
|
||||
collection_ms = collected_in.as_millis(),
|
||||
encoding_ms = encoded_in.as_millis(),
|
||||
"responded /metrics"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("failed to write out /metrics response: {e:#}");
|
||||
// there is a chance that this error is not the BrokenPipe we generate in the writer
|
||||
// for "closed connection", but it is highly unlikely.
|
||||
tracing::warn!(
|
||||
after_bytes = writer.flushed_bytes(),
|
||||
total_ms = total.as_millis(),
|
||||
spawning_ms = spawned_in.as_millis(),
|
||||
collection_ms = collected_in.as_millis(),
|
||||
encoding_ms = encoded_in.as_millis(),
|
||||
"failed to write out /metrics response: {e:?}"
|
||||
);
|
||||
// semantics of this error are quite... unclear. we want to error the stream out to
|
||||
// abort the response to somehow notify the client that we failed.
|
||||
//
|
||||
|
||||
@@ -415,7 +415,6 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
use serde::ser::Serialize;
|
||||
use serde_assert::{Deserializer, Serializer, Token, Tokens};
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#![warn(missing_docs)]
|
||||
|
||||
use std::cmp::{Eq, Ordering, PartialOrd};
|
||||
use std::cmp::{Eq, Ordering};
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt::Debug;
|
||||
use std::mem;
|
||||
@@ -249,7 +249,6 @@ where
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
impl MonotonicCounter<i32> for i32 {
|
||||
fn cnt_advance(&mut self, val: i32) {
|
||||
|
||||
@@ -221,7 +221,7 @@ impl RcuWaitList {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -239,7 +239,6 @@ mod tests {
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
pin::{pin, Pin},
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
@@ -35,6 +36,7 @@ humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
itertools.workspace = true
|
||||
leaky-bucket.workspace = true
|
||||
md5.workspace = true
|
||||
nix.workspace = true
|
||||
# hack to get the number of worker threads tokio uses
|
||||
@@ -71,6 +73,7 @@ url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_compaction.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
pq_proto.workspace = true
|
||||
@@ -82,7 +85,7 @@ workspace_hack.workspace = true
|
||||
reqwest.workspace = true
|
||||
rpds.workspace = true
|
||||
enum-map.workspace = true
|
||||
enumset.workspace = true
|
||||
enumset = { workspace = true, features = ["serde"]}
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
|
||||
@@ -217,6 +217,20 @@ impl Client {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn tenant_time_travel_remote_storage(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timestamp: &str,
|
||||
done_if_after: &str,
|
||||
) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
self.request(Method::PUT, &uri, ()).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
|
||||
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
|
||||
self.request(Method::PUT, &uri, req).await?;
|
||||
|
||||
54
pageserver/compaction/Cargo.toml
Normal file
54
pageserver/compaction/Cargo.toml
Normal file
@@ -0,0 +1,54 @@
|
||||
[package]
|
||||
name = "pageserver_compaction"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
const_format.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crossbeam-utils.workspace = true
|
||||
either.workspace = true
|
||||
flate2.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
itertools.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
rand.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
svg_fmt.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||
51
pageserver/compaction/TODO.md
Normal file
51
pageserver/compaction/TODO.md
Normal file
@@ -0,0 +1,51 @@
|
||||
# TODO
|
||||
|
||||
- If the key space can be perfectly partitioned at some key, perform planning on each
|
||||
partition separately. For example, if we are compacting a level with layers like this:
|
||||
|
||||
```
|
||||
:
|
||||
+--+ +----+ : +------+
|
||||
| | | | : | |
|
||||
+--+ +----+ : +------+
|
||||
:
|
||||
+-----+ +-+ : +--------+
|
||||
| | | | : | |
|
||||
+-----+ +-+ : +--------+
|
||||
:
|
||||
```
|
||||
|
||||
At the dotted line, there is a natural split in the key space, such that all
|
||||
layers are either on the left or the right of it. We can compact the
|
||||
partitions separately. We could choose to create image layers for one
|
||||
partition but not the other one, for example.
|
||||
|
||||
- All the layers don't have to be exactly the same size, we can choose to cut a
|
||||
layer short or stretch it a little larger than the target size, if it helps
|
||||
the overall system. We can help perfect partitions (see previous bullet point)
|
||||
to happen more frequently, by choosing the cut points wisely. For example, try
|
||||
to cut layers at boundaries of underlying image layers. And "snap to grid",
|
||||
i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.
|
||||
|
||||
- Avoid rewriting layers when we'd just create an identical layer to an input
|
||||
layer.
|
||||
|
||||
- Parallelism. The code is already split up into planning and execution, so that
|
||||
we first split up the compaction work into "Jobs", and then execute them.
|
||||
It would be straightforward to execute multiple jobs in parallel.
|
||||
|
||||
- Materialize extra pages in delta layers during compaction. This would reduce
|
||||
read amplification. There has been the idea of partial image layers. Materializing
|
||||
extra pages in the delta layers achieve the same goal, without introducing a new
|
||||
concept.
|
||||
|
||||
## Simulator
|
||||
|
||||
- Expand the simulator for more workloads
|
||||
- Automate a test suite that runs the simluator with different workloads and
|
||||
spits out a table of results
|
||||
- Model read amplification
|
||||
- More sanity checking. One idea is to keep a reference count of each
|
||||
MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
|
||||
a MockRecord that is newer than PITR horizon is completely dropped. That would
|
||||
indicate that the record was lost.
|
||||
214
pageserver/compaction/src/bin/compaction-simulator.rs
Normal file
214
pageserver/compaction/src/bin/compaction-simulator.rs
Normal file
@@ -0,0 +1,214 @@
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_compaction::simulator::MockTimeline;
|
||||
use rand::Rng;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use utils::project_git_version;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(
|
||||
version = GIT_VERSION,
|
||||
about = "Neon Pageserver compaction simulator",
|
||||
long_about = "A developer tool to visualize and test compaction"
|
||||
)]
|
||||
#[command(propagate_version = true)]
|
||||
struct CliOpts {
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
RunSuite,
|
||||
Simulate(SimulateCmd),
|
||||
}
|
||||
|
||||
#[derive(Clone, clap::ValueEnum)]
|
||||
enum Distribution {
|
||||
Uniform,
|
||||
HotCold,
|
||||
}
|
||||
|
||||
/// Read and update pageserver metadata file
|
||||
#[derive(Parser)]
|
||||
struct SimulateCmd {
|
||||
distribution: Distribution,
|
||||
|
||||
/// Number of records to digest
|
||||
num_records: u64,
|
||||
/// Record length
|
||||
record_len: u64,
|
||||
|
||||
// Logical database size in MB
|
||||
logical_size: u64,
|
||||
}
|
||||
|
||||
async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
|
||||
let mut executor = MockTimeline::new();
|
||||
|
||||
// Convert the logical size in MB into a key range.
|
||||
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
|
||||
//let key_range = u64::MIN..u64::MAX;
|
||||
println!(
|
||||
"starting simulation with key range {:016X}-{:016X}",
|
||||
key_range.start, key_range.end
|
||||
);
|
||||
|
||||
// helper function to print progress indicator
|
||||
let print_progress = |i| -> anyhow::Result<()> {
|
||||
if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
|
||||
print!(
|
||||
"\ringested {} / {} records, {} MiB / {} MiB...",
|
||||
i + 1,
|
||||
cmd.num_records,
|
||||
(i + 1) * cmd.record_len / (1_000_000),
|
||||
cmd.num_records * cmd.record_len / (1_000_000),
|
||||
);
|
||||
std::io::stdout().flush()?;
|
||||
}
|
||||
Ok(())
|
||||
};
|
||||
|
||||
match cmd.distribution {
|
||||
Distribution::Uniform => {
|
||||
for i in 0..cmd.num_records {
|
||||
executor.ingest_uniform(1, cmd.record_len, &key_range)?;
|
||||
executor.compact_if_needed().await?;
|
||||
|
||||
print_progress(i)?;
|
||||
}
|
||||
}
|
||||
Distribution::HotCold => {
|
||||
let splitpoint = key_range.start + (key_range.end - key_range.start) / 10;
|
||||
let hot_key_range = 0..splitpoint;
|
||||
let cold_key_range = splitpoint..key_range.end;
|
||||
|
||||
for i in 0..cmd.num_records {
|
||||
let chosen_range = if rand::thread_rng().gen_bool(0.9) {
|
||||
&hot_key_range
|
||||
} else {
|
||||
&cold_key_range
|
||||
};
|
||||
executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
|
||||
executor.compact_if_needed().await?;
|
||||
|
||||
print_progress(i)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
println!("done!");
|
||||
executor.flush_l0();
|
||||
executor.compact_if_needed().await?;
|
||||
let stats = executor.stats()?;
|
||||
|
||||
// Print the stats to stdout, and also to a file
|
||||
print!("{stats}");
|
||||
std::fs::write(results_path.join("stats.txt"), stats)?;
|
||||
|
||||
let animation_path = results_path.join("compaction-animation.html");
|
||||
executor.draw_history(std::fs::File::create(&animation_path)?)?;
|
||||
println!(
|
||||
"animation: file://{}",
|
||||
animation_path.canonicalize()?.display()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
|
||||
std::fs::create_dir(results_path)?;
|
||||
|
||||
set_log_file(File::create(results_path.join("log"))?);
|
||||
let result = simulate(workload, results_path).await;
|
||||
set_log_stdout();
|
||||
result
|
||||
}
|
||||
|
||||
async fn run_suite() -> anyhow::Result<()> {
|
||||
let top_results_path = PathBuf::from(format!(
|
||||
"compaction-suite-results.{}",
|
||||
std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs()
|
||||
));
|
||||
std::fs::create_dir(&top_results_path)?;
|
||||
|
||||
let workload = SimulateCmd {
|
||||
distribution: Distribution::Uniform,
|
||||
// Generate 20 GB of WAL
|
||||
record_len: 1_000,
|
||||
num_records: 20_000_000,
|
||||
// Logical size 5 GB
|
||||
logical_size: 5_000,
|
||||
};
|
||||
|
||||
run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?;
|
||||
|
||||
println!(
|
||||
"All tests finished. Results in {}",
|
||||
top_results_path.display()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::Stdout;
|
||||
use std::sync::Mutex;
|
||||
use tracing_subscriber::fmt::writer::EitherWriter;
|
||||
use tracing_subscriber::fmt::MakeWriter;
|
||||
|
||||
static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
|
||||
fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
|
||||
LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout())))
|
||||
}
|
||||
|
||||
fn set_log_file(f: File) {
|
||||
*get_log_output().lock().unwrap() = EitherWriter::A(f);
|
||||
}
|
||||
|
||||
fn set_log_stdout() {
|
||||
*get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout());
|
||||
}
|
||||
|
||||
fn init_logging() -> anyhow::Result<()> {
|
||||
// We fall back to printing all spans at info-level or above if
|
||||
// the RUST_LOG environment variable is not set.
|
||||
let rust_log_env_filter = || {
|
||||
tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
|
||||
};
|
||||
|
||||
// NB: the order of the with() calls does not matter.
|
||||
// See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
|
||||
use tracing_subscriber::prelude::*;
|
||||
tracing_subscriber::registry()
|
||||
.with({
|
||||
let log_layer = tracing_subscriber::fmt::layer()
|
||||
.with_target(false)
|
||||
.with_ansi(false)
|
||||
.with_writer(|| get_log_output().make_writer());
|
||||
log_layer.with_filter(rust_log_env_filter())
|
||||
})
|
||||
.init();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = CliOpts::parse();
|
||||
|
||||
init_logging()?;
|
||||
|
||||
match cli.command {
|
||||
Commands::Simulate(cmd) => {
|
||||
simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?;
|
||||
}
|
||||
Commands::RunSuite => {
|
||||
run_suite().await?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
866
pageserver/compaction/src/compact_tiered.rs
Normal file
866
pageserver/compaction/src/compact_tiered.rs
Normal file
@@ -0,0 +1,866 @@
|
||||
//! # Tiered compaction algorithm.
|
||||
//!
|
||||
//! Read all the input delta files, and write a new set of delta files that
|
||||
//! include all the input WAL records. See retile_deltas().
|
||||
//!
|
||||
//! In a "normal" LSM tree, you get to remove any values that are overwritten by
|
||||
//! later values, but in our system, we keep all the history. So the reshuffling
|
||||
//! doesn't remove any garbage, it just reshuffles the records to reduce read
|
||||
//! amplification, i.e. the number of files that you need to access to find the
|
||||
//! WAL records for a given key.
|
||||
//!
|
||||
//! If the new delta files would be very "narrow", i.e. each file would cover
|
||||
//! only a narrow key range, then we create a new set of image files
|
||||
//! instead. The current threshold is that if the estimated total size of the
|
||||
//! image layers is smaller than the size of the deltas, then we create image
|
||||
//! layers. That amounts to 2x storage amplification, and it means that the
|
||||
//! distance of image layers in LSN dimension is roughly equal to the logical
|
||||
//! database size. For example, if the logical database size is 10 GB, we would
|
||||
//! generate new image layers every 10 GB of WAL.
|
||||
use futures::StreamExt;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
|
||||
use crate::interface::*;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::identify_levels::identify_level;
|
||||
|
||||
/// Main entry point to compaction.
|
||||
///
|
||||
/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
|
||||
/// everything below that point, that needs compaction. The cutoff LSN must
|
||||
/// partition the layers so that there are no layers that span across that
|
||||
/// LSN. To start compaction at the top of the tree, pass the end LSN of the
|
||||
/// written last L0 layer.
|
||||
pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
executor: &mut E,
|
||||
end_lsn: Lsn,
|
||||
target_file_size: u64,
|
||||
fanout: u64,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(fanout >= 2);
|
||||
// Start at L0
|
||||
let mut current_level_no = 0;
|
||||
let mut current_level_target_height = target_file_size;
|
||||
loop {
|
||||
// end LSN +1 to include possible image layers exactly at 'end_lsn'.
|
||||
let all_layers = executor
|
||||
.get_layers(
|
||||
&(E::Key::MIN..E::Key::MAX),
|
||||
&(Lsn(u64::MIN)..end_lsn + 1),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
info!(
|
||||
"Compacting L{}, total # of layers: {}",
|
||||
current_level_no,
|
||||
all_layers.len()
|
||||
);
|
||||
|
||||
// Identify the range of LSNs that belong to this level. We assume that
|
||||
// each file in this level span an LSN range up to 1.75x target file
|
||||
// size. That should give us enough slop that if we created a slightly
|
||||
// oversized L0 layer, e.g. because flushing the in-memory layer was
|
||||
// delayed for some reason, we don't consider the oversized layer to
|
||||
// belong to L1. But not too much slop, that we don't accidentally
|
||||
// "skip" levels.
|
||||
let max_height = (current_level_target_height as f64 * 1.75) as u64;
|
||||
let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
|
||||
break;
|
||||
};
|
||||
|
||||
// Calculate the height of this level. If the # of tiers exceeds the
|
||||
// fanout parameter, it's time to compact it.
|
||||
let depth = level.depth();
|
||||
info!(
|
||||
"Level {} identified as LSN range {}-{}: depth {}",
|
||||
current_level_no, level.lsn_range.start, level.lsn_range.end, depth
|
||||
);
|
||||
for l in &level.layers {
|
||||
debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
|
||||
}
|
||||
if depth < fanout {
|
||||
debug!(
|
||||
level = current_level_no,
|
||||
depth = depth,
|
||||
fanout,
|
||||
"too few deltas to compact"
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
compact_level(
|
||||
&level.lsn_range,
|
||||
&level.layers,
|
||||
executor,
|
||||
target_file_size,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
if target_file_size == u64::MAX {
|
||||
break;
|
||||
}
|
||||
current_level_no += 1;
|
||||
current_level_target_height = current_level_target_height.saturating_mul(fanout);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn compact_level<E: CompactionJobExecutor>(
|
||||
lsn_range: &Range<Lsn>,
|
||||
layers: &[E::Layer],
|
||||
executor: &mut E,
|
||||
target_file_size: u64,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<bool> {
|
||||
let mut layer_fragments = Vec::new();
|
||||
for l in layers {
|
||||
layer_fragments.push(LayerFragment::new(l.clone()));
|
||||
}
|
||||
|
||||
let mut state = LevelCompactionState {
|
||||
target_file_size,
|
||||
_lsn_range: lsn_range.clone(),
|
||||
layers: layer_fragments,
|
||||
jobs: Vec::new(),
|
||||
job_queue: Vec::new(),
|
||||
next_level: false,
|
||||
executor,
|
||||
};
|
||||
|
||||
let first_job = CompactionJob {
|
||||
key_range: E::Key::MIN..E::Key::MAX,
|
||||
lsn_range: lsn_range.clone(),
|
||||
strategy: CompactionStrategy::Divide,
|
||||
input_layers: state
|
||||
.layers
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|i| LayerId(i.0))
|
||||
.collect(),
|
||||
completed: false,
|
||||
};
|
||||
|
||||
state.jobs.push(first_job);
|
||||
state.job_queue.push(JobId(0));
|
||||
state.execute(ctx).await?;
|
||||
|
||||
info!(
|
||||
"compaction completed! Need to process next level: {}",
|
||||
state.next_level
|
||||
);
|
||||
|
||||
Ok(state.next_level)
|
||||
}
|
||||
|
||||
/// Blackboard that keeps track of the state of all the jobs and work remaining
|
||||
struct LevelCompactionState<'a, E>
|
||||
where
|
||||
E: CompactionJobExecutor,
|
||||
{
|
||||
// parameters
|
||||
target_file_size: u64,
|
||||
|
||||
_lsn_range: Range<Lsn>,
|
||||
layers: Vec<LayerFragment<E>>,
|
||||
|
||||
// job queue
|
||||
jobs: Vec<CompactionJob<E>>,
|
||||
job_queue: Vec<JobId>,
|
||||
|
||||
/// If false, no need to compact levels below this
|
||||
next_level: bool,
|
||||
|
||||
/// Interface to the outside world
|
||||
executor: &'a mut E,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
|
||||
struct LayerId(usize);
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
|
||||
struct JobId(usize);
|
||||
|
||||
struct PendingJobSet {
|
||||
pending: HashSet<JobId>,
|
||||
completed: HashSet<JobId>,
|
||||
}
|
||||
|
||||
impl PendingJobSet {
|
||||
fn new() -> Self {
|
||||
PendingJobSet {
|
||||
pending: HashSet::new(),
|
||||
completed: HashSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn complete_job(&mut self, job_id: JobId) {
|
||||
self.pending.remove(&job_id);
|
||||
self.completed.insert(job_id);
|
||||
}
|
||||
|
||||
fn all_completed(&self) -> bool {
|
||||
self.pending.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
// When we decide to rewrite a set of layers, LayerFragment is used to keep
|
||||
// track which new layers supersede an old layer. When all the stakeholder jobs
|
||||
// have completed, this layer can be deleted.
|
||||
struct LayerFragment<E>
|
||||
where
|
||||
E: CompactionJobExecutor,
|
||||
{
|
||||
layer: E::Layer,
|
||||
|
||||
// If we will write new layers to replace this one, this keeps track of the
|
||||
// jobs that need to complete before this layer can be deleted. As the jobs
|
||||
// complete, they are moved from 'pending' to 'completed' set. Once the
|
||||
// 'pending' set becomes empty, the layer can be deleted.
|
||||
//
|
||||
// If None, this layer is not rewritten and must not be deleted.
|
||||
deletable_after: Option<PendingJobSet>,
|
||||
|
||||
deleted: bool,
|
||||
}
|
||||
|
||||
impl<E> LayerFragment<E>
|
||||
where
|
||||
E: CompactionJobExecutor,
|
||||
{
|
||||
fn new(layer: E::Layer) -> Self {
|
||||
LayerFragment {
|
||||
layer,
|
||||
deletable_after: None,
|
||||
deleted: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq)]
|
||||
enum CompactionStrategy {
|
||||
Divide,
|
||||
CreateDelta,
|
||||
CreateImage,
|
||||
}
|
||||
|
||||
#[allow(dead_code)] // Todo
|
||||
struct CompactionJob<E: CompactionJobExecutor> {
|
||||
key_range: Range<E::Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
strategy: CompactionStrategy,
|
||||
|
||||
input_layers: Vec<LayerId>,
|
||||
|
||||
completed: bool,
|
||||
}
|
||||
|
||||
impl<'a, E> LevelCompactionState<'a, E>
|
||||
where
|
||||
E: CompactionJobExecutor,
|
||||
{
|
||||
/// Main loop of the executor.
|
||||
///
|
||||
/// In each iteration, we take the next job from the queue, and execute it.
|
||||
/// The execution might add new jobs to the queue. Keep going until the
|
||||
/// queue is empty.
|
||||
///
|
||||
/// Initially, the job queue consists of one Divide job over the whole
|
||||
/// level. On first call, it is divided into smaller jobs.
|
||||
async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
|
||||
// TODO: this would be pretty straightforward to parallelize with FuturesUnordered
|
||||
while let Some(next_job_id) = self.job_queue.pop() {
|
||||
info!("executing job {}", next_job_id.0);
|
||||
self.execute_job(next_job_id, ctx).await?;
|
||||
}
|
||||
|
||||
// all done!
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
|
||||
let job = &self.jobs[job_id.0];
|
||||
match job.strategy {
|
||||
CompactionStrategy::Divide => {
|
||||
self.divide_job(job_id, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
CompactionStrategy::CreateDelta => {
|
||||
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
|
||||
let mut layer_ids: Vec<LayerId> = Vec::new();
|
||||
for layer_id in &job.input_layers {
|
||||
let layer = &self.layers[layer_id.0].layer;
|
||||
if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
|
||||
deltas.push(dl.clone());
|
||||
layer_ids.push(*layer_id);
|
||||
}
|
||||
}
|
||||
|
||||
self.executor
|
||||
.create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
|
||||
.await?;
|
||||
self.jobs[job_id.0].completed = true;
|
||||
|
||||
// did we complete any fragments?
|
||||
for layer_id in layer_ids {
|
||||
let l = &mut self.layers[layer_id.0];
|
||||
if let Some(deletable_after) = l.deletable_after.as_mut() {
|
||||
deletable_after.complete_job(job_id);
|
||||
if deletable_after.all_completed() {
|
||||
self.executor.delete_layer(&l.layer, ctx).await?;
|
||||
l.deleted = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.next_level = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
CompactionStrategy::CreateImage => {
|
||||
self.executor
|
||||
.create_image(job.lsn_range.end, &job.key_range, ctx)
|
||||
.await?;
|
||||
self.jobs[job_id.0].completed = true;
|
||||
|
||||
// TODO: we could check if any layers < PITR horizon became deletable
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
|
||||
let job_id = JobId(self.jobs.len());
|
||||
self.jobs.push(job);
|
||||
self.job_queue.push(job_id);
|
||||
job_id
|
||||
}
|
||||
|
||||
/// Take a partition of the key space, and decide how to compact it.
|
||||
///
|
||||
/// TODO: Currently, this is called exactly once for the level, and we
|
||||
/// decide whether to create new image layers to cover the whole level, or
|
||||
/// write a new set of delta. In the future, this should try to partition
|
||||
/// the key space, and make the decision separately for each partition.
|
||||
async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
|
||||
let job = &self.jobs[job_id.0];
|
||||
assert!(job.strategy == CompactionStrategy::Divide);
|
||||
|
||||
// Check for dummy cases
|
||||
if job.input_layers.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let job = &self.jobs[job_id.0];
|
||||
assert!(job.strategy == CompactionStrategy::Divide);
|
||||
|
||||
// Would it be better to create images for this partition?
|
||||
// Decide based on the average density of the level
|
||||
let keyspace_size = keyspace_total_size(
|
||||
&self
|
||||
.executor
|
||||
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
|
||||
.await?,
|
||||
) * 8192;
|
||||
|
||||
let wal_size = job
|
||||
.input_layers
|
||||
.iter()
|
||||
.filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
|
||||
.map(|layer_id| self.layers[layer_id.0].layer.file_size())
|
||||
.sum::<u64>();
|
||||
if keyspace_size < wal_size {
|
||||
// seems worth it
|
||||
info!(
|
||||
"covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
|
||||
keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
|
||||
);
|
||||
self.cover_with_images(job_id, ctx).await
|
||||
} else {
|
||||
// do deltas
|
||||
info!(
|
||||
"coverage not worth it, keyspace_size {}, wal_size {}",
|
||||
keyspace_size, wal_size
|
||||
);
|
||||
self.retile_deltas(job_id, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
// LSN
|
||||
// ^
|
||||
// |
|
||||
// | ###|###|#####
|
||||
// | +--+-----+--+ +--+-----+--+
|
||||
// | | | | | | | | |
|
||||
// | +--+--+--+--+ +--+--+--+--+
|
||||
// | | | | | | |
|
||||
// | +---+-+-+---+ ==> +---+-+-+---+
|
||||
// | | | | | | | | |
|
||||
// | +---+-+-++--+ +---+-+-++--+
|
||||
// | | | | | | | | |
|
||||
// | +-----+--+--+ +-----+--+--+
|
||||
// |
|
||||
// +--------------> key
|
||||
//
|
||||
async fn cover_with_images(
|
||||
&mut self,
|
||||
job_id: JobId,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let job = &self.jobs[job_id.0];
|
||||
assert!(job.strategy == CompactionStrategy::Divide);
|
||||
|
||||
// XXX: do we still need the "holes" stuff?
|
||||
|
||||
let mut new_jobs = Vec::new();
|
||||
|
||||
// Slide a window through the keyspace
|
||||
let keyspace = self
|
||||
.executor
|
||||
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
|
||||
.await?;
|
||||
|
||||
let mut window = KeyspaceWindow::new(
|
||||
E::Key::MIN..E::Key::MAX,
|
||||
keyspace,
|
||||
self.target_file_size / 8192,
|
||||
);
|
||||
while let Some(key_range) = window.choose_next_image() {
|
||||
new_jobs.push(CompactionJob::<E> {
|
||||
key_range,
|
||||
lsn_range: job.lsn_range.clone(),
|
||||
strategy: CompactionStrategy::CreateImage,
|
||||
input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer?
|
||||
completed: false,
|
||||
});
|
||||
}
|
||||
|
||||
for j in new_jobs.into_iter().rev() {
|
||||
let _job_id = self.push_job(j);
|
||||
|
||||
// TODO: image layers don't let us delete anything. unless < PITR horizon
|
||||
//let j = &self.jobs[job_id.0];
|
||||
// for layer_id in j.input_layers.iter() {
|
||||
// self.layers[layer_id.0].pending_stakeholders.insert(job_id);
|
||||
//}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Merge the contents of all the input delta layers into a new set
|
||||
// of delta layers, based on the current partitioning.
|
||||
//
|
||||
// We split the new delta layers on the key dimension. We iterate through
|
||||
// the key space, and for each key, check if including the next key to the
|
||||
// current output layer we're building would cause the layer to become too
|
||||
// large. If so, dump the current output layer and start new one. It's
|
||||
// possible that there is a single key with so many page versions that
|
||||
// storing all of them in a single layer file would be too large. In that
|
||||
// case, we also split on the LSN dimension.
|
||||
//
|
||||
// LSN
|
||||
// ^
|
||||
// |
|
||||
// | +-----------+ +--+--+--+--+
|
||||
// | | | | | | | |
|
||||
// | +-----------+ | | | | |
|
||||
// | | | | | | | |
|
||||
// | +-----------+ ==> | | | | |
|
||||
// | | | | | | | |
|
||||
// | +-----------+ | | | | |
|
||||
// | | | | | | | |
|
||||
// | +-----------+ +--+--+--+--+
|
||||
// |
|
||||
// +--------------> key
|
||||
//
|
||||
//
|
||||
// If one key (X) has a lot of page versions:
|
||||
//
|
||||
// LSN
|
||||
// ^
|
||||
// | (X)
|
||||
// | +-----------+ +--+--+--+--+
|
||||
// | | | | | | | |
|
||||
// | +-----------+ | | +--+ |
|
||||
// | | | | | | | |
|
||||
// | +-----------+ ==> | | | | |
|
||||
// | | | | | +--+ |
|
||||
// | +-----------+ | | | | |
|
||||
// | | | | | | | |
|
||||
// | +-----------+ +--+--+--+--+
|
||||
// |
|
||||
// +--------------> key
|
||||
//
|
||||
// TODO: this actually divides the layers into fixed-size chunks, not
|
||||
// based on the partitioning.
|
||||
//
|
||||
// TODO: we should also opportunistically materialize and
|
||||
// garbage collect what we can.
|
||||
async fn retile_deltas(
|
||||
&mut self,
|
||||
job_id: JobId,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let job = &self.jobs[job_id.0];
|
||||
assert!(job.strategy == CompactionStrategy::Divide);
|
||||
|
||||
// Sweep the key space left to right, running an estimate of how much
|
||||
// disk size and keyspace we have accumulated
|
||||
//
|
||||
// Once the disk size reaches the target threshold, stop and think.
|
||||
// If we have accumulated only a narrow band of keyspace, create an
|
||||
// image layer. Otherwise write a delta layer.
|
||||
|
||||
// FIXME: deal with the case of lots of values for same key
|
||||
|
||||
// FIXME: we are ignoring images here. Did we already divide the work
|
||||
// so that we won't encounter them here?
|
||||
|
||||
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
|
||||
for layer_id in &job.input_layers {
|
||||
let l = &self.layers[layer_id.0];
|
||||
if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
|
||||
deltas.push(dl.clone());
|
||||
}
|
||||
}
|
||||
// Open stream
|
||||
let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
|
||||
let mut new_jobs = Vec::new();
|
||||
|
||||
// Slide a window through the keyspace
|
||||
let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
|
||||
let mut all_in_window: bool = false;
|
||||
let mut window = Window::new();
|
||||
loop {
|
||||
if all_in_window && window.elems.is_empty() {
|
||||
// All done!
|
||||
break;
|
||||
}
|
||||
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
|
||||
{
|
||||
let batch_layers: Vec<LayerId> = job
|
||||
.input_layers
|
||||
.iter()
|
||||
.filter(|layer_id| {
|
||||
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
assert!(!batch_layers.is_empty());
|
||||
new_jobs.push(CompactionJob {
|
||||
key_range,
|
||||
lsn_range: job.lsn_range.clone(),
|
||||
strategy: CompactionStrategy::CreateDelta,
|
||||
input_layers: batch_layers,
|
||||
completed: false,
|
||||
});
|
||||
} else {
|
||||
assert!(!all_in_window);
|
||||
if let Some(next_key) = key_accum.next().await.transpose()? {
|
||||
window.feed(next_key.key, next_key.size);
|
||||
} else {
|
||||
all_in_window = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// All the input files are rewritten. Set up the tracking for when they can
|
||||
// be deleted.
|
||||
for layer_id in job.input_layers.iter() {
|
||||
let l = &mut self.layers[layer_id.0];
|
||||
assert!(l.deletable_after.is_none());
|
||||
l.deletable_after = Some(PendingJobSet::new());
|
||||
}
|
||||
for j in new_jobs.into_iter().rev() {
|
||||
let job_id = self.push_job(j);
|
||||
let j = &self.jobs[job_id.0];
|
||||
for layer_id in j.input_layers.iter() {
|
||||
self.layers[layer_id.0]
|
||||
.deletable_after
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.pending
|
||||
.insert(job_id);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// Sliding window through keyspace and values
|
||||
// This is used by over_with_images to decide on good split points
|
||||
struct KeyspaceWindow<K> {
|
||||
head: KeyspaceWindowHead<K>,
|
||||
|
||||
start_pos: KeyspaceWindowPos<K>,
|
||||
}
|
||||
struct KeyspaceWindowHead<K> {
|
||||
// overall key range to cover
|
||||
key_range: Range<K>,
|
||||
|
||||
keyspace: Vec<Range<K>>,
|
||||
target_keysize: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct KeyspaceWindowPos<K> {
|
||||
end_key: K,
|
||||
|
||||
keyspace_idx: usize,
|
||||
|
||||
accum_keysize: u64,
|
||||
}
|
||||
impl<K: CompactionKey> KeyspaceWindowPos<K> {
|
||||
fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
|
||||
self.keyspace_idx == w.keyspace.len()
|
||||
}
|
||||
|
||||
// Advance the cursor until it reaches 'target_keysize'.
|
||||
fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
|
||||
while self.accum_keysize < max_size && !self.reached_end(w) {
|
||||
let curr_range = &w.keyspace[self.keyspace_idx];
|
||||
if self.end_key < curr_range.start {
|
||||
// skip over any unused space
|
||||
self.end_key = curr_range.start;
|
||||
}
|
||||
|
||||
// We're now within 'curr_range'. Can we advance past it completely?
|
||||
let distance = K::key_range_size(&(self.end_key..curr_range.end));
|
||||
if (self.accum_keysize + distance as u64) < max_size {
|
||||
// oh yeah, it fits
|
||||
self.end_key = curr_range.end;
|
||||
self.keyspace_idx += 1;
|
||||
self.accum_keysize += distance as u64;
|
||||
} else {
|
||||
// advance within the range
|
||||
let skip_key = self.end_key.skip_some();
|
||||
let distance = K::key_range_size(&(self.end_key..skip_key));
|
||||
if (self.accum_keysize + distance as u64) < max_size {
|
||||
self.end_key = skip_key;
|
||||
self.accum_keysize += distance as u64;
|
||||
} else {
|
||||
self.end_key = self.end_key.next();
|
||||
self.accum_keysize += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K> KeyspaceWindow<K>
|
||||
where
|
||||
K: CompactionKey,
|
||||
{
|
||||
fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
|
||||
assert!(keyspace.first().unwrap().start >= key_range.start);
|
||||
|
||||
let start_key = key_range.start;
|
||||
let start_pos = KeyspaceWindowPos::<K> {
|
||||
end_key: start_key,
|
||||
keyspace_idx: 0,
|
||||
accum_keysize: 0,
|
||||
};
|
||||
Self {
|
||||
head: KeyspaceWindowHead::<K> {
|
||||
key_range,
|
||||
keyspace,
|
||||
target_keysize,
|
||||
},
|
||||
start_pos,
|
||||
}
|
||||
}
|
||||
|
||||
fn choose_next_image(&mut self) -> Option<Range<K>> {
|
||||
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
|
||||
// we've reached the end
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut next_pos = self.start_pos.clone();
|
||||
next_pos.advance_until_size(
|
||||
&self.head,
|
||||
self.start_pos.accum_keysize + self.head.target_keysize,
|
||||
);
|
||||
|
||||
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
|
||||
// 1.25x target size
|
||||
let mut end_pos = next_pos.clone();
|
||||
end_pos.advance_until_size(
|
||||
&self.head,
|
||||
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
|
||||
);
|
||||
if end_pos.reached_end(&self.head) {
|
||||
// gobble up any unused keyspace between the last used key and end of the range
|
||||
assert!(end_pos.end_key <= self.head.key_range.end);
|
||||
end_pos.end_key = self.head.key_range.end;
|
||||
next_pos = end_pos;
|
||||
}
|
||||
|
||||
let start_key = self.start_pos.end_key;
|
||||
self.start_pos = next_pos;
|
||||
Some(start_key..self.start_pos.end_key)
|
||||
}
|
||||
}
|
||||
|
||||
// Sliding window through keyspace and values
|
||||
//
|
||||
// This is used to decide what layer to write next, from the beginning of the window.
|
||||
//
|
||||
// Candidates:
|
||||
//
|
||||
// 1. Create an image layer, snapping to previous images
|
||||
// 2. Create a delta layer, snapping to previous images
|
||||
// 3. Create an image layer, snapping to
|
||||
//
|
||||
//
|
||||
|
||||
// Take previous partitioning, based on the image layers below.
|
||||
//
|
||||
// Candidate is at the front:
|
||||
//
|
||||
// Consider stretching an image layer to next divider? If it's close enough,
|
||||
// that's the image candidate
|
||||
//
|
||||
// If it's too far, consider splitting at a reasonable point
|
||||
//
|
||||
// Is the image candidate smaller than the equivalent delta? If so,
|
||||
// split off the image. Otherwise, split off one delta.
|
||||
// Try to snap off the delta at a reasonable point
|
||||
|
||||
struct WindowElement<K> {
|
||||
start_key: K, // inclusive
|
||||
last_key: K, // inclusive
|
||||
accum_size: u64,
|
||||
}
|
||||
struct Window<K> {
|
||||
elems: VecDeque<WindowElement<K>>,
|
||||
|
||||
// last key that was split off, inclusive
|
||||
splitoff_key: Option<K>,
|
||||
splitoff_size: u64,
|
||||
}
|
||||
|
||||
impl<K> Window<K>
|
||||
where
|
||||
K: CompactionKey,
|
||||
{
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
elems: VecDeque::new(),
|
||||
splitoff_key: None,
|
||||
splitoff_size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn feed(&mut self, key: K, size: u64) {
|
||||
let last_size;
|
||||
if let Some(last) = self.elems.back_mut() {
|
||||
assert!(last.last_key <= key);
|
||||
if key == last.last_key {
|
||||
last.accum_size += size;
|
||||
return;
|
||||
}
|
||||
last_size = last.accum_size;
|
||||
} else {
|
||||
last_size = 0;
|
||||
}
|
||||
// This is a new key.
|
||||
let elem = WindowElement {
|
||||
start_key: key,
|
||||
last_key: key,
|
||||
accum_size: last_size + size,
|
||||
};
|
||||
self.elems.push_back(elem);
|
||||
}
|
||||
|
||||
fn remain_size(&self) -> u64 {
|
||||
self.elems.back().unwrap().accum_size - self.splitoff_size
|
||||
}
|
||||
|
||||
fn peek_size(&self) -> u64 {
|
||||
self.elems.front().unwrap().accum_size - self.splitoff_size
|
||||
}
|
||||
|
||||
fn commit_upto(&mut self, mut upto: usize) {
|
||||
while upto > 1 {
|
||||
let popped = self.elems.pop_front().unwrap();
|
||||
self.elems.front_mut().unwrap().start_key = popped.start_key;
|
||||
upto -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn find_size_split(&self, target_size: u64) -> usize {
|
||||
self.elems
|
||||
.partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
|
||||
}
|
||||
|
||||
fn pop(&mut self) {
|
||||
let first = self.elems.pop_front().unwrap();
|
||||
self.splitoff_size = first.accum_size;
|
||||
|
||||
self.splitoff_key = Some(first.last_key);
|
||||
}
|
||||
|
||||
// the difference between delta and image is that an image covers
|
||||
// any unused keyspace before and after, while a delta tries to
|
||||
// minimize that. TODO: difference not implemented
|
||||
fn pop_delta(&mut self) -> Range<K> {
|
||||
let first = self.elems.front().unwrap();
|
||||
let key_range = first.start_key..first.last_key.next();
|
||||
|
||||
self.pop();
|
||||
key_range
|
||||
}
|
||||
|
||||
// Prerequisite: we have enough input in the window
|
||||
//
|
||||
// On return None, the caller should feed more data and call again
|
||||
fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
|
||||
if has_more && self.elems.is_empty() {
|
||||
// Starting up
|
||||
return None;
|
||||
}
|
||||
|
||||
// If we still have an undersized candidate, just keep going
|
||||
while self.peek_size() < target_size {
|
||||
if self.elems.len() > 1 {
|
||||
self.commit_upto(2);
|
||||
} else if has_more {
|
||||
return None;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure we have enough input in the window to make a good decision
|
||||
if has_more && self.remain_size() < target_size * 5 / 4 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// The candidate on the front is now large enough, for a delta.
|
||||
// And we have enough data in the window to decide.
|
||||
|
||||
// If we're willing to stretch it up to 1.25 target size, could we
|
||||
// gobble up the rest of the work? This avoids creating very small
|
||||
// "tail" layers at the end of the keyspace
|
||||
if !has_more && self.remain_size() < target_size * 5 / 3 {
|
||||
self.commit_upto(self.elems.len());
|
||||
} else {
|
||||
let delta_split_at = self.find_size_split(target_size);
|
||||
self.commit_upto(delta_split_at);
|
||||
|
||||
// If it's still not large enough, request the caller to fill the window
|
||||
if self.elems.len() == 1 && has_more {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
Some(self.pop_delta())
|
||||
}
|
||||
}
|
||||
242
pageserver/compaction/src/helpers.rs
Normal file
242
pageserver/compaction/src/helpers.rs
Normal file
@@ -0,0 +1,242 @@
|
||||
//! This file contains generic utility functions over the interface types,
|
||||
//! which could be handy for any compaction implementation.
|
||||
use crate::interface::*;
|
||||
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{Stream, StreamExt};
|
||||
use itertools::Itertools;
|
||||
use pin_project_lite::pin_project;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::VecDeque;
|
||||
use std::future::Future;
|
||||
use std::ops::{DerefMut, Range};
|
||||
use std::pin::Pin;
|
||||
use std::task::{ready, Poll};
|
||||
|
||||
pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
|
||||
where
|
||||
K: CompactionKey,
|
||||
{
|
||||
keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
|
||||
}
|
||||
|
||||
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
|
||||
!(a.end <= b.start || b.end <= a.start)
|
||||
}
|
||||
|
||||
pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
|
||||
let x = std::mem::take(a);
|
||||
let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
|
||||
.into_iter()
|
||||
.kmerge_by(|a, b| a.start < b.start);
|
||||
let mut ranges = Vec::new();
|
||||
if let Some(first) = all_ranges_iter.next() {
|
||||
let (mut start, mut end) = (first.start, first.end);
|
||||
|
||||
for r in all_ranges_iter {
|
||||
assert!(r.start >= start);
|
||||
if r.start > end {
|
||||
ranges.push(start..end);
|
||||
start = r.start;
|
||||
end = r.end;
|
||||
} else if r.end > end {
|
||||
end = r.end;
|
||||
}
|
||||
}
|
||||
ranges.push(start..end);
|
||||
}
|
||||
*a = ranges
|
||||
}
|
||||
|
||||
pub fn intersect_keyspace<K: Ord + Clone + Copy>(
|
||||
a: &CompactionKeySpace<K>,
|
||||
r: &Range<K>,
|
||||
) -> CompactionKeySpace<K> {
|
||||
let mut ranges: Vec<Range<K>> = Vec::new();
|
||||
|
||||
for x in a.iter() {
|
||||
if x.end <= r.start {
|
||||
continue;
|
||||
}
|
||||
if x.start >= r.end {
|
||||
break;
|
||||
}
|
||||
ranges.push(x.clone())
|
||||
}
|
||||
|
||||
// trim the ends
|
||||
if let Some(first) = ranges.first_mut() {
|
||||
first.start = std::cmp::max(first.start, r.start);
|
||||
}
|
||||
if let Some(last) = ranges.last_mut() {
|
||||
last.end = std::cmp::min(last.end, r.end);
|
||||
}
|
||||
ranges
|
||||
}
|
||||
|
||||
/// Create a stream that iterates through all DeltaEntrys among all input
|
||||
/// layers, in key-lsn order.
|
||||
///
|
||||
/// This is public because the create_delta() implementation likely wants to use this too
|
||||
/// TODO: move to a more shared place
|
||||
pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
|
||||
layers: &'a [E::DeltaLayer],
|
||||
ctx: &'a E::RequestContext,
|
||||
) -> MergeDeltaKeys<'a, E> {
|
||||
// Use a binary heap to merge the layers. Each input layer is initially
|
||||
// represented by a LazyLoadLayer::Unloaded element, which uses the start of
|
||||
// the layer's key range as the key. The first time a layer reaches the top
|
||||
// of the heap, all the keys of the layer are loaded into a sorted vector.
|
||||
//
|
||||
// This helps to keep the memory usage reasonable: we only need to hold in
|
||||
// memory the DeltaEntrys of the layers that overlap with the "current" key.
|
||||
let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
|
||||
for l in layers {
|
||||
heap.push(LazyLoadLayer::Unloaded(l));
|
||||
}
|
||||
MergeDeltaKeys {
|
||||
heap,
|
||||
ctx,
|
||||
load_future: None,
|
||||
}
|
||||
}
|
||||
|
||||
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
|
||||
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
|
||||
Unloaded(&'a E::DeltaLayer),
|
||||
}
|
||||
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
|
||||
fn key(&self) -> E::Key {
|
||||
match self {
|
||||
Self::Loaded(entries) => entries.front().unwrap().key(),
|
||||
Self::Unloaded(dl) => dl.key_range().start,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
// reverse order so that we get a min-heap
|
||||
other.key().cmp(&self.key())
|
||||
}
|
||||
}
|
||||
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.key().eq(&other.key())
|
||||
}
|
||||
}
|
||||
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
|
||||
|
||||
type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
|
||||
|
||||
// Stream returned by `merge_delta_keys`
|
||||
pin_project! {
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
|
||||
heap: BinaryHeap<LazyLoadLayer<'a, E>>,
|
||||
|
||||
#[pin]
|
||||
load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
|
||||
|
||||
ctx: &'a E::RequestContext,
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, E> Stream for MergeDeltaKeys<'a, E>
|
||||
where
|
||||
E: CompactionJobExecutor + 'a,
|
||||
{
|
||||
type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
|
||||
|
||||
fn poll_next(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
|
||||
let mut this = self.project();
|
||||
loop {
|
||||
if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
|
||||
// We are waiting for loading the keys to finish
|
||||
match ready!(load_future.as_mut().poll(cx)) {
|
||||
Ok(entries) => {
|
||||
this.load_future.set(None);
|
||||
*this.heap.peek_mut().unwrap() =
|
||||
LazyLoadLayer::Loaded(VecDeque::from(entries));
|
||||
}
|
||||
Err(e) => {
|
||||
return Poll::Ready(Some(Err(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the topmost layer in the heap hasn't been loaded yet, start
|
||||
// loading it. Otherwise return the next entry from it and update
|
||||
// the layer's position in the heap (this decreaseKey operation is
|
||||
// performed implicitly when `top` is dropped).
|
||||
if let Some(mut top) = this.heap.peek_mut() {
|
||||
match top.deref_mut() {
|
||||
LazyLoadLayer::Unloaded(ref mut l) => {
|
||||
let fut = l.load_keys(this.ctx);
|
||||
this.load_future.set(Some(fut));
|
||||
continue;
|
||||
}
|
||||
LazyLoadLayer::Loaded(ref mut entries) => {
|
||||
let result = entries.pop_front().unwrap();
|
||||
if entries.is_empty() {
|
||||
std::collections::binary_heap::PeekMut::pop(top);
|
||||
}
|
||||
return Poll::Ready(Some(Ok(result)));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Poll::Ready(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Accumulate values at key boundaries
|
||||
pub struct KeySize<K> {
|
||||
pub key: K,
|
||||
pub num_values: u64,
|
||||
pub size: u64,
|
||||
}
|
||||
|
||||
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
|
||||
where
|
||||
K: Eq,
|
||||
I: Stream<Item = Result<D, E>>,
|
||||
D: CompactionDeltaEntry<'a, K>,
|
||||
{
|
||||
async_stream::try_stream! {
|
||||
// Initialize the state from the first value
|
||||
let mut input = std::pin::pin!(input);
|
||||
|
||||
if let Some(first) = input.next().await {
|
||||
let first = first?;
|
||||
let mut accum: KeySize<K> = KeySize {
|
||||
key: first.key(),
|
||||
num_values: 1,
|
||||
size: first.size(),
|
||||
};
|
||||
while let Some(this) = input.next().await {
|
||||
let this = this?;
|
||||
if this.key() == accum.key {
|
||||
accum.size += this.size();
|
||||
accum.num_values += 1;
|
||||
} else {
|
||||
yield accum;
|
||||
accum = KeySize {
|
||||
key: this.key(),
|
||||
num_values: 1,
|
||||
size: this.size(),
|
||||
};
|
||||
}
|
||||
}
|
||||
yield accum;
|
||||
}
|
||||
}
|
||||
}
|
||||
376
pageserver/compaction/src/identify_levels.rs
Normal file
376
pageserver/compaction/src/identify_levels.rs
Normal file
@@ -0,0 +1,376 @@
|
||||
//! An LSM tree consists of multiple levels, each exponential larger than the
|
||||
//! previous level. And each level consists of be multiple "tiers". With tiered
|
||||
//! compaction, a level is compacted when it has accumulated more than N tiers,
|
||||
//! forming one tier on the next level.
|
||||
//!
|
||||
//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
|
||||
//! we identify them by looking at the shapes of the layers. It's an easy task
|
||||
//! for a human, but it's not straightforward to come up with the exact
|
||||
//! rules. Especially if there are cases like interrupted, half-finished
|
||||
//! compactions, or highly skewed data distributions that have let us "skip"
|
||||
//! some levels. It's not critical to classify all cases correctly; at worst we
|
||||
//! delay some compaction work, and suffer from more read amplification, or we
|
||||
//! perform some unnecessary compaction work.
|
||||
//!
|
||||
//! `identify_level` performs that shape-matching.
|
||||
//!
|
||||
//! It returns a Level struct, which has `depth()` function to count the number
|
||||
//! of "tiers" in the level. The tier count is the max depth of stacked layers
|
||||
//! within the level. That's a good measure, because the point of compacting is
|
||||
//! to reduce read amplification, and the depth is what determines that.
|
||||
//!
|
||||
//! One interesting effect of this is that if we generate very small delta
|
||||
//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
|
||||
//! because they reach the target size, the L0 compaction will combine them to
|
||||
//! one larger file. But if the combined file is still smaller than the target
|
||||
//! file size, the file will still be considered to be part of L0 at the next
|
||||
//! iteration.
|
||||
|
||||
use anyhow::bail;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::Range;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::interface::*;
|
||||
|
||||
use tracing::{info, trace};
|
||||
|
||||
pub struct Level<L> {
|
||||
pub lsn_range: Range<Lsn>,
|
||||
pub layers: Vec<L>,
|
||||
}
|
||||
|
||||
/// Identify an LSN > `end_lsn` that partitions the LSN space, so that there are
|
||||
/// no layers that cross the boundary LSN.
|
||||
///
|
||||
/// A further restriction is that all layers in the returned partition cover at
|
||||
/// most 'lsn_max_size' LSN bytes.
|
||||
pub async fn identify_level<K, L>(
|
||||
all_layers: Vec<L>,
|
||||
end_lsn: Lsn,
|
||||
lsn_max_size: u64,
|
||||
) -> anyhow::Result<Option<Level<L>>>
|
||||
where
|
||||
K: CompactionKey,
|
||||
L: CompactionLayer<K> + Clone,
|
||||
{
|
||||
// filter out layers that are above the `end_lsn`, they are completely irrelevant.
|
||||
let mut layers = Vec::new();
|
||||
for l in all_layers {
|
||||
if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
|
||||
// shouldn't happen. Indicates that the caller passed a bogus
|
||||
// end_lsn.
|
||||
bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
|
||||
}
|
||||
// include image layers sitting exacty at `end_lsn`.
|
||||
let is_image = !l.is_delta();
|
||||
if (is_image && l.lsn_range().start > end_lsn)
|
||||
|| (!is_image && l.lsn_range().start >= end_lsn)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
layers.push(l);
|
||||
}
|
||||
// All the remaining layers either belong to this level, or are below it.
|
||||
info!(
|
||||
"identify level at {}, size {}, num layers below: {}",
|
||||
end_lsn,
|
||||
lsn_max_size,
|
||||
layers.len()
|
||||
);
|
||||
if layers.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Walk the ranges in LSN order.
|
||||
//
|
||||
// ----- end_lsn
|
||||
// |
|
||||
// |
|
||||
// v
|
||||
//
|
||||
layers.sort_by_key(|l| l.lsn_range().end);
|
||||
let mut candidate_start_lsn = end_lsn;
|
||||
let mut candidate_layers: Vec<L> = Vec::new();
|
||||
let mut current_best_start_lsn = end_lsn;
|
||||
let mut current_best_layers: Vec<L> = Vec::new();
|
||||
let mut iter = layers.into_iter();
|
||||
loop {
|
||||
let Some(l) = iter.next_back() else {
|
||||
// Reached end. Accept the last candidate
|
||||
current_best_start_lsn = candidate_start_lsn;
|
||||
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
|
||||
break;
|
||||
};
|
||||
trace!(
|
||||
"inspecting {} for candidate {}, current best {}",
|
||||
l.short_id(),
|
||||
candidate_start_lsn,
|
||||
current_best_start_lsn
|
||||
);
|
||||
|
||||
let r = l.lsn_range();
|
||||
|
||||
// Image layers don't restrict our choice of cutoff LSN
|
||||
if l.is_delta() {
|
||||
// Is this candidate workable? In other words, are there any
|
||||
// delta layers that span across this LSN
|
||||
//
|
||||
// Valid: Not valid:
|
||||
// + +
|
||||
// | | +
|
||||
// + <- candidate + | <- candidate
|
||||
// + +
|
||||
// |
|
||||
// +
|
||||
if r.end <= candidate_start_lsn {
|
||||
// Hooray, there are no crossing LSNs. And we have visited
|
||||
// through all the layers within candidate..end_lsn. The
|
||||
// current candidate can be accepted.
|
||||
current_best_start_lsn = r.end;
|
||||
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
|
||||
candidate_start_lsn = r.start;
|
||||
}
|
||||
|
||||
// Is it small enough to be considered part of this level?
|
||||
if r.end.0 - r.start.0 > lsn_max_size {
|
||||
// Too large, this layer belongs to next level. Stop.
|
||||
trace!(
|
||||
"too large {}, size {} vs {}",
|
||||
l.short_id(),
|
||||
r.end.0 - r.start.0,
|
||||
lsn_max_size
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
// If this crosses the candidate lsn, push it down.
|
||||
if r.start < candidate_start_lsn {
|
||||
trace!(
|
||||
"layer {} prevents from stopping at {}",
|
||||
l.short_id(),
|
||||
candidate_start_lsn
|
||||
);
|
||||
candidate_start_lsn = r.start;
|
||||
}
|
||||
}
|
||||
|
||||
// Include this layer in our candidate
|
||||
candidate_layers.push(l);
|
||||
}
|
||||
|
||||
Ok(if current_best_start_lsn == end_lsn {
|
||||
// empty level
|
||||
None
|
||||
} else {
|
||||
Some(Level {
|
||||
lsn_range: current_best_start_lsn..end_lsn,
|
||||
layers: current_best_layers,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// helper struct used in depth()
|
||||
struct Event<K> {
|
||||
key: K,
|
||||
layer_idx: usize,
|
||||
start: bool,
|
||||
}
|
||||
|
||||
impl<L> Level<L> {
|
||||
/// Count the number of deltas stacked on each other.
|
||||
pub fn depth<K>(&self) -> u64
|
||||
where
|
||||
K: CompactionKey,
|
||||
L: CompactionLayer<K>,
|
||||
{
|
||||
let mut events: Vec<Event<K>> = Vec::new();
|
||||
for (idx, l) in self.layers.iter().enumerate() {
|
||||
events.push(Event {
|
||||
key: l.key_range().start,
|
||||
layer_idx: idx,
|
||||
start: true,
|
||||
});
|
||||
events.push(Event {
|
||||
key: l.key_range().end,
|
||||
layer_idx: idx,
|
||||
start: false,
|
||||
});
|
||||
}
|
||||
events.sort_by_key(|e| (e.key, e.start));
|
||||
|
||||
// Sweep the key space left to right. Stop at each distinct key, and
|
||||
// count the number of deltas on top of the highest image at that key.
|
||||
//
|
||||
// This is a little enefficient, as we walk through the active_set on
|
||||
// every key. We could increment/decrement a counter on each step
|
||||
// instead, but that'd require a bit more complex bookkeeping.
|
||||
let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
|
||||
let mut max_depth = 0;
|
||||
let mut events_iter = events.iter().peekable();
|
||||
while let Some(e) = events_iter.next() {
|
||||
let l = &self.layers[e.layer_idx];
|
||||
let is_image = !l.is_delta();
|
||||
|
||||
// update the active set
|
||||
if e.start {
|
||||
active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
|
||||
} else {
|
||||
active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
|
||||
}
|
||||
|
||||
// recalculate depth if this was the last event at this point
|
||||
let more_events_at_this_key = events_iter
|
||||
.peek()
|
||||
.map_or(false, |next_e| next_e.key == e.key);
|
||||
if !more_events_at_this_key {
|
||||
let mut active_depth = 0;
|
||||
for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
|
||||
if *is_image {
|
||||
break;
|
||||
}
|
||||
active_depth += 1;
|
||||
}
|
||||
if active_depth > max_depth {
|
||||
max_depth = active_depth;
|
||||
}
|
||||
}
|
||||
}
|
||||
max_depth
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
|
||||
MockLayer::Delta(Arc::new(MockDeltaLayer {
|
||||
key_range,
|
||||
lsn_range,
|
||||
// identify_level() doesn't pay attention to the rest of the fields
|
||||
file_size: 0,
|
||||
deleted: Mutex::new(false),
|
||||
records: vec![],
|
||||
}))
|
||||
}
|
||||
|
||||
fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
|
||||
MockLayer::Image(Arc::new(MockImageLayer {
|
||||
key_range,
|
||||
lsn_range: lsn..(lsn + 1),
|
||||
// identify_level() doesn't pay attention to the rest of the fields
|
||||
file_size: 0,
|
||||
deleted: Mutex::new(false),
|
||||
}))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_identify_level() -> anyhow::Result<()> {
|
||||
let layers = vec![
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
|
||||
];
|
||||
|
||||
// All layers fit in the max file size
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.depth(), 6);
|
||||
|
||||
// Same LSN with smaller max file size. The second layer from the top is larger
|
||||
// and belongs to next level.
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.depth(), 1);
|
||||
|
||||
// Call with a smaller LSN
|
||||
let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.depth(), 2);
|
||||
|
||||
// Call with an LSN that doesn't partition the space
|
||||
let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
|
||||
assert!(result.is_err());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
|
||||
// The files LSN ranges overlap, so even though there are more files that
|
||||
// fit under the file size, they are not included in the level because they
|
||||
// overlap so that we'd need to include the oldest file, too, which is
|
||||
// larger
|
||||
let layers = vec![
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
|
||||
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
|
||||
];
|
||||
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.depth(), 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
|
||||
// The key ranges don't overlap, so depth is only 1.
|
||||
let layers = vec![
|
||||
delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
|
||||
delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
|
||||
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
|
||||
];
|
||||
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.layers.len(), 3);
|
||||
assert_eq!(level.depth(), 1);
|
||||
|
||||
// Staggered. The 1st and 3rd layer don't overlap with each other.
|
||||
let layers = vec![
|
||||
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
|
||||
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
|
||||
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
|
||||
];
|
||||
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.layers.len(), 3);
|
||||
assert_eq!(level.depth(), 2);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_depth_images() -> anyhow::Result<()> {
|
||||
let layers: Vec<MockLayer> = vec![
|
||||
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
|
||||
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
|
||||
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
|
||||
// This covers the same key range as the 2nd delta layer. The depth
|
||||
// in that key range is therefore 0.
|
||||
image(1500..2500, Lsn(0x9000)),
|
||||
];
|
||||
|
||||
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
|
||||
.await?
|
||||
.unwrap();
|
||||
assert_eq!(level.layers.len(), 4);
|
||||
assert_eq!(level.depth(), 1);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
167
pageserver/compaction/src/interface.rs
Normal file
167
pageserver/compaction/src/interface.rs
Normal file
@@ -0,0 +1,167 @@
|
||||
//! This is what the compaction implementation needs to know about
|
||||
//! layers, keyspace etc.
|
||||
//!
|
||||
//! All the heavy lifting is done by the create_image and create_delta
|
||||
//! functions that the implementor provides.
|
||||
use async_trait::async_trait;
|
||||
use pageserver_api::{key::Key, keyspace::key_range_size};
|
||||
use std::ops::Range;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Public interface. This is the main thing that the implementor needs to provide
|
||||
#[async_trait]
|
||||
pub trait CompactionJobExecutor {
|
||||
// Type system.
|
||||
//
|
||||
// We assume that there are two kinds of layers, deltas and images. The
|
||||
// compaction doesn't distinguish whether they are stored locally or
|
||||
// remotely.
|
||||
//
|
||||
// The keyspace is defined by CompactionKey trait.
|
||||
//
|
||||
type Key: CompactionKey;
|
||||
|
||||
type Layer: CompactionLayer<Self::Key> + Clone;
|
||||
type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
|
||||
type ImageLayer: CompactionImageLayer<Self> + Clone;
|
||||
|
||||
// This is passed through to all the interface functions. The compaction
|
||||
// implementation doesn't do anything with it, but it might be useful for
|
||||
// the interface implementation.
|
||||
type RequestContext: CompactionRequestContext;
|
||||
|
||||
// ----
|
||||
// Functions that the planner uses to support its decisions
|
||||
// ----
|
||||
|
||||
/// Return all layers that overlap the given bounding box.
|
||||
async fn get_layers(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
lsn_range: &Range<Lsn>,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<Vec<Self::Layer>>;
|
||||
|
||||
async fn get_keyspace(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
lsn: Lsn,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
|
||||
|
||||
/// NB: This is a pretty expensive operation. In the real pageserver
|
||||
/// implementation, it downloads the layer, and keeps it resident
|
||||
/// until the DeltaLayer is dropped.
|
||||
async fn downcast_delta_layer(
|
||||
&self,
|
||||
layer: &Self::Layer,
|
||||
) -> anyhow::Result<Option<Self::DeltaLayer>>;
|
||||
|
||||
// ----
|
||||
// Functions to execute the plan
|
||||
// ----
|
||||
|
||||
/// Create a new image layer, materializing all the values in the key range,
|
||||
/// at given 'lsn'.
|
||||
async fn create_image(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Self::Key>,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Create a new delta layer, containing all the values from 'input_layers'
|
||||
/// in the given key and LSN range.
|
||||
async fn create_delta(
|
||||
&mut self,
|
||||
lsn_range: &Range<Lsn>,
|
||||
key_range: &Range<Self::Key>,
|
||||
input_layers: &[Self::DeltaLayer],
|
||||
ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Delete a layer. The compaction implementation will call this only after
|
||||
/// all the create_image() or create_delta() calls that deletion of this
|
||||
/// layer depends on have finished. But if the implementor has extra lazy
|
||||
/// background tasks, like uploading the index json file to remote storage,
|
||||
/// it is the implementation's responsibility to track those.
|
||||
async fn delete_layer(
|
||||
&mut self,
|
||||
layer: &Self::Layer,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
|
||||
const MIN: Self;
|
||||
const MAX: Self;
|
||||
|
||||
/// Calculate distance between key_range.start and key_range.end.
|
||||
///
|
||||
/// This returns u32, for compatibility with Repository::key. If the
|
||||
/// distance is larger, return u32::MAX.
|
||||
fn key_range_size(key_range: &Range<Self>) -> u32;
|
||||
|
||||
// return "self + 1"
|
||||
fn next(&self) -> Self;
|
||||
|
||||
// return "self + <some decent amount to skip>". The amount to skip
|
||||
// is left to the implementation.
|
||||
// FIXME: why not just "add(u32)" ? This is hard to use
|
||||
fn skip_some(&self) -> Self;
|
||||
}
|
||||
|
||||
impl CompactionKey for Key {
|
||||
const MIN: Self = Self::MIN;
|
||||
const MAX: Self = Self::MAX;
|
||||
|
||||
fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
|
||||
key_range_size(r)
|
||||
}
|
||||
fn next(&self) -> Key {
|
||||
(self as &Key).next()
|
||||
}
|
||||
fn skip_some(&self) -> Key {
|
||||
self.add(128)
|
||||
}
|
||||
}
|
||||
|
||||
/// Contiguous ranges of keys that belong to the key space. In key order, and
|
||||
/// with no overlap.
|
||||
pub type CompactionKeySpace<K> = Vec<Range<K>>;
|
||||
|
||||
/// Functions needed from all layers.
|
||||
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
|
||||
fn key_range(&self) -> &Range<K>;
|
||||
fn lsn_range(&self) -> &Range<Lsn>;
|
||||
|
||||
fn file_size(&self) -> u64;
|
||||
|
||||
/// For debugging, short human-readable representation of the layer. E.g. filename.
|
||||
fn short_id(&self) -> String;
|
||||
|
||||
fn is_delta(&self) -> bool;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
|
||||
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
|
||||
where
|
||||
Self: 'a;
|
||||
|
||||
/// Return all keys in this delta layer.
|
||||
async fn load_keys<'a>(
|
||||
&self,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
|
||||
}
|
||||
|
||||
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
|
||||
|
||||
pub trait CompactionDeltaEntry<'a, K> {
|
||||
fn key(&self) -> K;
|
||||
fn lsn(&self) -> Lsn;
|
||||
fn size(&self) -> u64;
|
||||
}
|
||||
|
||||
pub trait CompactionRequestContext {}
|
||||
12
pageserver/compaction/src/lib.rs
Normal file
12
pageserver/compaction/src/lib.rs
Normal file
@@ -0,0 +1,12 @@
|
||||
// The main module implementing the compaction algorithm
|
||||
pub mod compact_tiered;
|
||||
pub(crate) mod identify_levels;
|
||||
|
||||
// Traits that the caller of the compaction needs to implement
|
||||
pub mod interface;
|
||||
|
||||
// Utility functions, useful for the implementation
|
||||
pub mod helpers;
|
||||
|
||||
// A simulator with mock implementations of 'interface'
|
||||
pub mod simulator;
|
||||
613
pageserver/compaction/src/simulator.rs
Normal file
613
pageserver/compaction/src/simulator.rs
Normal file
@@ -0,0 +1,613 @@
|
||||
mod draw;
|
||||
|
||||
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use futures::StreamExt;
|
||||
use rand::Rng;
|
||||
use tracing::info;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use std::fmt::Write;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::helpers::{merge_delta_keys, overlaps_with};
|
||||
|
||||
use crate::interface;
|
||||
use crate::interface::CompactionLayer;
|
||||
|
||||
//
|
||||
// Implementation for the CompactionExecutor interface
|
||||
//
|
||||
pub struct MockTimeline {
|
||||
// Parameters for the compaction algorithm
|
||||
pub target_file_size: u64,
|
||||
tiers_per_level: u64,
|
||||
|
||||
num_l0_flushes: u64,
|
||||
last_compact_at_flush: u64,
|
||||
last_flush_lsn: Lsn,
|
||||
|
||||
// In-memory layer
|
||||
records: Vec<MockRecord>,
|
||||
total_len: u64,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
|
||||
// Current keyspace at `end_lsn`. This is updated on every ingested record.
|
||||
keyspace: KeySpace,
|
||||
|
||||
// historic keyspaces
|
||||
old_keyspaces: Vec<(Lsn, KeySpace)>,
|
||||
|
||||
// "on-disk" layers
|
||||
pub live_layers: Vec<MockLayer>,
|
||||
|
||||
num_deleted_layers: u64,
|
||||
|
||||
// Statistics
|
||||
wal_ingested: u64,
|
||||
bytes_written: u64,
|
||||
bytes_deleted: u64,
|
||||
layers_created: u64,
|
||||
layers_deleted: u64,
|
||||
|
||||
// All the events - creation and deletion of files - are collected
|
||||
// in 'history'. It is used to draw the SVG animation at the end.
|
||||
time: u64,
|
||||
history: Vec<draw::LayerTraceEvent>,
|
||||
}
|
||||
|
||||
type KeySpace = interface::CompactionKeySpace<Key>;
|
||||
|
||||
pub struct MockRequestContext {}
|
||||
impl interface::CompactionRequestContext for MockRequestContext {}
|
||||
|
||||
pub type Key = u64;
|
||||
|
||||
impl interface::CompactionKey for Key {
|
||||
const MIN: Self = u64::MIN;
|
||||
const MAX: Self = u64::MAX;
|
||||
|
||||
fn key_range_size(key_range: &Range<Self>) -> u32 {
|
||||
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
|
||||
}
|
||||
|
||||
fn next(&self) -> Self {
|
||||
self + 1
|
||||
}
|
||||
fn skip_some(&self) -> Self {
|
||||
// round up to next xx
|
||||
self + 100
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct MockRecord {
|
||||
lsn: Lsn,
|
||||
key: Key,
|
||||
len: u64,
|
||||
}
|
||||
|
||||
impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
|
||||
fn key(&self) -> Key {
|
||||
self.key
|
||||
}
|
||||
fn lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
fn size(&self) -> u64 {
|
||||
self.len
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MockDeltaLayer {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
|
||||
pub file_size: u64,
|
||||
|
||||
pub deleted: Mutex<bool>,
|
||||
|
||||
pub records: Vec<MockRecord>,
|
||||
}
|
||||
|
||||
impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
&self.key_range
|
||||
}
|
||||
fn lsn_range(&self) -> &Range<Lsn> {
|
||||
&self.lsn_range
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
format!(
|
||||
"{:016X}-{:016X}__{:08X}-{:08X}",
|
||||
self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0
|
||||
)
|
||||
}
|
||||
|
||||
fn is_delta(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
|
||||
type DeltaEntry<'a> = MockRecord;
|
||||
|
||||
async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
|
||||
Ok(self.records.clone())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MockImageLayer {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
|
||||
pub file_size: u64,
|
||||
|
||||
pub deleted: Mutex<bool>,
|
||||
}
|
||||
|
||||
impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}
|
||||
|
||||
impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
&self.key_range
|
||||
}
|
||||
fn lsn_range(&self) -> &Range<Lsn> {
|
||||
&self.lsn_range
|
||||
}
|
||||
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
format!(
|
||||
"{:016X}-{:016X}__{:08X}",
|
||||
self.key_range.start, self.key_range.end, self.lsn_range.start.0,
|
||||
)
|
||||
}
|
||||
|
||||
fn is_delta(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl MockTimeline {
|
||||
pub fn new() -> Self {
|
||||
MockTimeline {
|
||||
target_file_size: 256 * 1024 * 1024,
|
||||
tiers_per_level: 4,
|
||||
|
||||
num_l0_flushes: 0,
|
||||
last_compact_at_flush: 0,
|
||||
last_flush_lsn: Lsn(0),
|
||||
|
||||
records: Vec::new(),
|
||||
total_len: 0,
|
||||
start_lsn: Lsn(1000),
|
||||
end_lsn: Lsn(1000),
|
||||
keyspace: KeySpace::new(),
|
||||
|
||||
old_keyspaces: vec![],
|
||||
|
||||
live_layers: vec![],
|
||||
|
||||
num_deleted_layers: 0,
|
||||
|
||||
wal_ingested: 0,
|
||||
bytes_written: 0,
|
||||
bytes_deleted: 0,
|
||||
layers_created: 0,
|
||||
layers_deleted: 0,
|
||||
|
||||
time: 0,
|
||||
history: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn compact(&mut self) -> anyhow::Result<()> {
|
||||
let ctx = MockRequestContext {};
|
||||
|
||||
crate::compact_tiered::compact_tiered(
|
||||
self,
|
||||
self.last_flush_lsn,
|
||||
self.target_file_size,
|
||||
self.tiers_per_level,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Ingest one record to the timeline
|
||||
pub fn ingest_record(&mut self, key: Key, len: u64) {
|
||||
self.records.push(MockRecord {
|
||||
lsn: self.end_lsn,
|
||||
key,
|
||||
len,
|
||||
});
|
||||
self.total_len += len;
|
||||
self.end_lsn += len;
|
||||
|
||||
if self.total_len > self.target_file_size {
|
||||
self.flush_l0();
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
|
||||
if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level {
|
||||
self.compact().await?;
|
||||
self.last_compact_at_flush = self.num_l0_flushes;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush_l0(&mut self) {
|
||||
if self.records.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut records = std::mem::take(&mut self.records);
|
||||
records.sort_by_key(|rec| rec.key);
|
||||
|
||||
let lsn_range = self.start_lsn..self.end_lsn;
|
||||
let new_layer = Arc::new(MockDeltaLayer {
|
||||
key_range: Key::MIN..Key::MAX,
|
||||
lsn_range: lsn_range.clone(),
|
||||
file_size: self.total_len,
|
||||
records,
|
||||
deleted: Mutex::new(false),
|
||||
});
|
||||
info!("flushed L0 layer {}", new_layer.short_id());
|
||||
self.live_layers.push(MockLayer::from(&new_layer));
|
||||
|
||||
// reset L0
|
||||
self.start_lsn = self.end_lsn;
|
||||
self.total_len = 0;
|
||||
self.records = Vec::new();
|
||||
|
||||
self.layers_created += 1;
|
||||
self.bytes_written += new_layer.file_size;
|
||||
|
||||
self.time += 1;
|
||||
self.history.push(LayerTraceEvent {
|
||||
time_rel: self.time,
|
||||
op: LayerTraceOp::Flush,
|
||||
file: LayerTraceFile {
|
||||
filename: new_layer.short_id(),
|
||||
key_range: new_layer.key_range.clone(),
|
||||
lsn_range: new_layer.lsn_range.clone(),
|
||||
},
|
||||
});
|
||||
|
||||
self.num_l0_flushes += 1;
|
||||
self.last_flush_lsn = self.end_lsn;
|
||||
}
|
||||
|
||||
// Ingest `num_records' records to the timeline, with random keys
|
||||
// uniformly distributed in `key_range`
|
||||
pub fn ingest_uniform(
|
||||
&mut self,
|
||||
num_records: u64,
|
||||
len: u64,
|
||||
key_range: &Range<Key>,
|
||||
) -> anyhow::Result<()> {
|
||||
crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 0..num_records {
|
||||
self.ingest_record(rng.gen_range(key_range.clone()), len);
|
||||
self.wal_ingested += len;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stats(&self) -> anyhow::Result<String> {
|
||||
let mut s = String::new();
|
||||
|
||||
writeln!(s, "STATISTICS:")?;
|
||||
writeln!(
|
||||
s,
|
||||
"WAL ingested: {:>10} MB",
|
||||
self.wal_ingested / (1024 * 1024)
|
||||
)?;
|
||||
writeln!(
|
||||
s,
|
||||
"size created: {:>10} MB",
|
||||
self.bytes_written / (1024 * 1024)
|
||||
)?;
|
||||
writeln!(
|
||||
s,
|
||||
"size deleted: {:>10} MB",
|
||||
self.bytes_deleted / (1024 * 1024)
|
||||
)?;
|
||||
writeln!(s, "files created: {:>10}", self.layers_created)?;
|
||||
writeln!(s, "files deleted: {:>10}", self.layers_deleted)?;
|
||||
writeln!(
|
||||
s,
|
||||
"write amp: {:>10.2}",
|
||||
self.bytes_written as f64 / self.wal_ingested as f64
|
||||
)?;
|
||||
writeln!(
|
||||
s,
|
||||
"storage amp: {:>10.2}",
|
||||
(self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64
|
||||
)?;
|
||||
|
||||
Ok(s)
|
||||
}
|
||||
|
||||
pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
|
||||
draw::draw_history(&self.history, output)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MockTimeline {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum MockLayer {
|
||||
Delta(Arc<MockDeltaLayer>),
|
||||
Image(Arc<MockImageLayer>),
|
||||
}
|
||||
|
||||
impl interface::CompactionLayer<Key> for MockLayer {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.key_range(),
|
||||
MockLayer::Image(this) => this.key_range(),
|
||||
}
|
||||
}
|
||||
fn lsn_range(&self) -> &Range<Lsn> {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.lsn_range(),
|
||||
MockLayer::Image(this) => this.lsn_range(),
|
||||
}
|
||||
}
|
||||
fn file_size(&self) -> u64 {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.file_size(),
|
||||
MockLayer::Image(this) => this.file_size(),
|
||||
}
|
||||
}
|
||||
fn short_id(&self) -> String {
|
||||
match self {
|
||||
MockLayer::Delta(this) => this.short_id(),
|
||||
MockLayer::Image(this) => this.short_id(),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_delta(&self) -> bool {
|
||||
match self {
|
||||
MockLayer::Delta(_) => true,
|
||||
MockLayer::Image(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MockLayer {
|
||||
fn is_deleted(&self) -> bool {
|
||||
let guard = match self {
|
||||
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
|
||||
MockLayer::Image(this) => this.deleted.lock().unwrap(),
|
||||
};
|
||||
*guard
|
||||
}
|
||||
fn mark_deleted(&self) {
|
||||
let mut deleted_guard = match self {
|
||||
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
|
||||
MockLayer::Image(this) => this.deleted.lock().unwrap(),
|
||||
};
|
||||
assert!(!*deleted_guard, "layer already deleted");
|
||||
*deleted_guard = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&Arc<MockDeltaLayer>> for MockLayer {
|
||||
fn from(l: &Arc<MockDeltaLayer>) -> Self {
|
||||
MockLayer::Delta(l.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&Arc<MockImageLayer>> for MockLayer {
|
||||
fn from(l: &Arc<MockImageLayer>) -> Self {
|
||||
MockLayer::Image(l.clone())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl interface::CompactionJobExecutor for MockTimeline {
|
||||
type Key = Key;
|
||||
type Layer = MockLayer;
|
||||
type DeltaLayer = Arc<MockDeltaLayer>;
|
||||
type ImageLayer = Arc<MockImageLayer>;
|
||||
type RequestContext = MockRequestContext;
|
||||
|
||||
async fn get_layers(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
lsn_range: &Range<Lsn>,
|
||||
_ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<Vec<Self::Layer>> {
|
||||
// Clear any deleted layers from our vec
|
||||
self.live_layers.retain(|l| !l.is_deleted());
|
||||
|
||||
let layers: Vec<MockLayer> = self
|
||||
.live_layers
|
||||
.iter()
|
||||
.filter(|l| {
|
||||
overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range)
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
Ok(layers)
|
||||
}
|
||||
|
||||
async fn get_keyspace(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
_lsn: Lsn,
|
||||
_ctx: &Self::RequestContext,
|
||||
) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
|
||||
// find it in the levels
|
||||
if self.old_keyspaces.is_empty() {
|
||||
Ok(crate::helpers::intersect_keyspace(
|
||||
&self.keyspace,
|
||||
key_range,
|
||||
))
|
||||
} else {
|
||||
// not implemented
|
||||
|
||||
// The mock implementation only allows requesting the
|
||||
// keyspace at the level's end LSN. That's all that the
|
||||
// current implementation needs.
|
||||
panic!("keyspace not available for requested lsn");
|
||||
}
|
||||
}
|
||||
|
||||
async fn downcast_delta_layer(
|
||||
&self,
|
||||
layer: &MockLayer,
|
||||
) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
|
||||
Ok(match layer {
|
||||
MockLayer::Delta(l) => Some(l.clone()),
|
||||
MockLayer::Image(_) => None,
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_image(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Key>,
|
||||
ctx: &MockRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let keyspace = self.get_keyspace(key_range, lsn, ctx).await?;
|
||||
|
||||
let mut accum_size: u64 = 0;
|
||||
for r in keyspace {
|
||||
accum_size += r.end - r.start;
|
||||
}
|
||||
|
||||
let new_layer = Arc::new(MockImageLayer {
|
||||
key_range: key_range.clone(),
|
||||
lsn_range: lsn..lsn,
|
||||
file_size: accum_size * 8192,
|
||||
deleted: Mutex::new(false),
|
||||
});
|
||||
info!(
|
||||
"created image layer, size {}: {}",
|
||||
new_layer.file_size,
|
||||
new_layer.short_id()
|
||||
);
|
||||
self.live_layers.push(MockLayer::Image(new_layer.clone()));
|
||||
|
||||
// update stats
|
||||
self.bytes_written += new_layer.file_size;
|
||||
self.layers_created += 1;
|
||||
|
||||
self.time += 1;
|
||||
self.history.push(LayerTraceEvent {
|
||||
time_rel: self.time,
|
||||
op: LayerTraceOp::CreateImage,
|
||||
file: LayerTraceFile {
|
||||
filename: new_layer.short_id(),
|
||||
key_range: new_layer.key_range.clone(),
|
||||
lsn_range: new_layer.lsn_range.clone(),
|
||||
},
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_delta(
|
||||
&mut self,
|
||||
lsn_range: &Range<Lsn>,
|
||||
key_range: &Range<Key>,
|
||||
input_layers: &[Arc<MockDeltaLayer>],
|
||||
ctx: &MockRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut key_value_stream =
|
||||
std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
|
||||
let mut records: Vec<MockRecord> = Vec::new();
|
||||
let mut total_len = 2;
|
||||
while let Some(delta_entry) = key_value_stream.next().await {
|
||||
let delta_entry: MockRecord = delta_entry?;
|
||||
if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
|
||||
total_len += delta_entry.len;
|
||||
records.push(delta_entry);
|
||||
}
|
||||
}
|
||||
let total_records = records.len();
|
||||
let new_layer = Arc::new(MockDeltaLayer {
|
||||
key_range: key_range.clone(),
|
||||
lsn_range: lsn_range.clone(),
|
||||
file_size: total_len,
|
||||
records,
|
||||
deleted: Mutex::new(false),
|
||||
});
|
||||
info!(
|
||||
"created delta layer, recs {}, size {}: {}",
|
||||
total_records,
|
||||
total_len,
|
||||
new_layer.short_id()
|
||||
);
|
||||
self.live_layers.push(MockLayer::Delta(new_layer.clone()));
|
||||
|
||||
// update stats
|
||||
self.bytes_written += total_len;
|
||||
self.layers_created += 1;
|
||||
|
||||
self.time += 1;
|
||||
self.history.push(LayerTraceEvent {
|
||||
time_rel: self.time,
|
||||
op: LayerTraceOp::CreateDelta,
|
||||
file: LayerTraceFile {
|
||||
filename: new_layer.short_id(),
|
||||
key_range: new_layer.key_range.clone(),
|
||||
lsn_range: new_layer.lsn_range.clone(),
|
||||
},
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_layer(
|
||||
&mut self,
|
||||
layer: &Self::Layer,
|
||||
_ctx: &MockRequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let layer = std::pin::pin!(layer);
|
||||
info!("deleting layer: {}", layer.short_id());
|
||||
self.num_deleted_layers += 1;
|
||||
self.bytes_deleted += layer.file_size();
|
||||
layer.mark_deleted();
|
||||
|
||||
self.time += 1;
|
||||
self.history.push(LayerTraceEvent {
|
||||
time_rel: self.time,
|
||||
op: LayerTraceOp::Delete,
|
||||
file: LayerTraceFile {
|
||||
filename: layer.short_id(),
|
||||
key_range: layer.key_range().clone(),
|
||||
lsn_range: layer.lsn_range().clone(),
|
||||
},
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
411
pageserver/compaction/src/simulator/draw.rs
Normal file
411
pageserver/compaction/src/simulator/draw.rs
Normal file
@@ -0,0 +1,411 @@
|
||||
use super::Key;
|
||||
use anyhow::Result;
|
||||
use std::cmp::Ordering;
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet, HashSet},
|
||||
fmt::Write,
|
||||
ops::Range,
|
||||
};
|
||||
use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
// Map values to their compressed coordinate - the index the value
|
||||
// would have in a sorted and deduplicated list of all values.
|
||||
struct CoordinateMap<T: Ord + Copy> {
|
||||
map: BTreeMap<T, usize>,
|
||||
stretch: f32,
|
||||
}
|
||||
|
||||
impl<T: Ord + Copy> CoordinateMap<T> {
|
||||
fn new(coords: Vec<T>, stretch: f32) -> Self {
|
||||
let set: BTreeSet<T> = coords.into_iter().collect();
|
||||
|
||||
let mut map: BTreeMap<T, usize> = BTreeMap::new();
|
||||
for (i, e) in set.iter().enumerate() {
|
||||
map.insert(*e, i);
|
||||
}
|
||||
|
||||
Self { map, stretch }
|
||||
}
|
||||
|
||||
// This assumes that the map contains an exact point for this.
|
||||
// Use map_inexact for values inbetween
|
||||
fn map(&self, val: T) -> f32 {
|
||||
*self.map.get(&val).unwrap() as f32 * self.stretch
|
||||
}
|
||||
|
||||
// the value is still assumed to be within the min/max bounds
|
||||
// (this is currently unused)
|
||||
fn _map_inexact(&self, val: T) -> f32 {
|
||||
let prev = *self.map.range(..=val).next().unwrap().1;
|
||||
let next = *self.map.range(val..).next().unwrap().1;
|
||||
|
||||
// interpolate
|
||||
(prev as f32 + (next - prev) as f32) * self.stretch
|
||||
}
|
||||
|
||||
fn max(&self) -> f32 {
|
||||
self.map.len() as f32 * self.stretch
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Hash, Eq)]
|
||||
pub enum LayerTraceOp {
|
||||
Flush,
|
||||
CreateDelta,
|
||||
CreateImage,
|
||||
Delete,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for LayerTraceOp {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
let op_str = match self {
|
||||
LayerTraceOp::Flush => "flush",
|
||||
LayerTraceOp::CreateDelta => "create_delta",
|
||||
LayerTraceOp::CreateImage => "create_image",
|
||||
LayerTraceOp::Delete => "delete",
|
||||
};
|
||||
f.write_str(op_str)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Hash, Eq, Clone)]
|
||||
pub struct LayerTraceFile {
|
||||
pub filename: String,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl LayerTraceFile {
|
||||
fn is_image(&self) -> bool {
|
||||
self.lsn_range.end == self.lsn_range.start
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayerTraceEvent {
|
||||
pub time_rel: u64,
|
||||
pub op: LayerTraceOp,
|
||||
pub file: LayerTraceFile,
|
||||
}
|
||||
|
||||
pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
|
||||
let mut files: Vec<LayerTraceFile> = Vec::new();
|
||||
|
||||
for event in history {
|
||||
files.push(event.file.clone());
|
||||
}
|
||||
let last_time_rel = history.last().unwrap().time_rel;
|
||||
|
||||
// Collect all coordinates
|
||||
let mut keys: Vec<Key> = vec![];
|
||||
let mut lsns: Vec<Lsn> = vec![];
|
||||
for f in files.iter() {
|
||||
keys.push(f.key_range.start);
|
||||
keys.push(f.key_range.end);
|
||||
lsns.push(f.lsn_range.start);
|
||||
lsns.push(f.lsn_range.end);
|
||||
}
|
||||
|
||||
// Analyze
|
||||
let key_map = CoordinateMap::new(keys, 2.0);
|
||||
// Stretch out vertically for better visibility
|
||||
let lsn_map = CoordinateMap::new(lsns, 3.0);
|
||||
|
||||
let mut svg = String::new();
|
||||
|
||||
// Draw
|
||||
writeln!(
|
||||
svg,
|
||||
"{}",
|
||||
BeginSvg {
|
||||
w: key_map.max(),
|
||||
h: lsn_map.max(),
|
||||
}
|
||||
)?;
|
||||
let lsn_max = lsn_map.max();
|
||||
|
||||
// Sort the files by LSN, but so that image layers go after all delta layers
|
||||
// The SVG is painted in the order the elements appear, and we want to draw
|
||||
// image layers on top of the delta layers if they overlap
|
||||
//
|
||||
// (This could also be implemented via z coordinates: image layers get one z
|
||||
// coord, delta layers get another z coord.)
|
||||
let mut files_sorted: Vec<LayerTraceFile> = files.into_iter().collect();
|
||||
files_sorted.sort_by(|a, b| {
|
||||
if a.is_image() && !b.is_image() {
|
||||
Ordering::Greater
|
||||
} else if !a.is_image() && b.is_image() {
|
||||
Ordering::Less
|
||||
} else {
|
||||
a.lsn_range.end.cmp(&b.lsn_range.end)
|
||||
}
|
||||
});
|
||||
|
||||
writeln!(svg, "<!-- layers -->")?;
|
||||
let mut files_seen = HashSet::new();
|
||||
for f in files_sorted {
|
||||
if files_seen.contains(&f) {
|
||||
continue;
|
||||
}
|
||||
let key_start = key_map.map(f.key_range.start);
|
||||
let key_end = key_map.map(f.key_range.end);
|
||||
let key_diff = key_end - key_start;
|
||||
|
||||
if key_start >= key_end {
|
||||
panic!("Invalid key range {}-{}", key_start, key_end);
|
||||
}
|
||||
|
||||
let lsn_start = lsn_map.map(f.lsn_range.start);
|
||||
let lsn_end = lsn_map.map(f.lsn_range.end);
|
||||
|
||||
// Fill in and thicken rectangle if it's an
|
||||
// image layer so that we can see it.
|
||||
let mut style = Style::default();
|
||||
style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
|
||||
style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
|
||||
|
||||
let y_start = lsn_max - lsn_start;
|
||||
let y_end = lsn_max - lsn_end;
|
||||
|
||||
let x_margin = 0.25;
|
||||
let y_margin = 0.5;
|
||||
|
||||
match f.lsn_range.start.cmp(&f.lsn_range.end) {
|
||||
Ordering::Less => {
|
||||
write!(
|
||||
svg,
|
||||
r#" <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
|
||||
f.filename,
|
||||
key_start + x_margin,
|
||||
y_end + y_margin,
|
||||
key_diff - x_margin * 2.0,
|
||||
y_start - y_end - y_margin * 2.0,
|
||||
1.0, // border_radius,
|
||||
style,
|
||||
)?;
|
||||
write!(svg, "<title>{}</title>", f.filename)?;
|
||||
writeln!(svg, "</rect>")?;
|
||||
}
|
||||
Ordering::Equal => {
|
||||
//lsn_diff = 0.3;
|
||||
//lsn_offset = -lsn_diff / 2.0;
|
||||
//margin = 0.05;
|
||||
style.fill = Fill::Color(rgb(0x80, 0, 0x80));
|
||||
style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
|
||||
write!(
|
||||
svg,
|
||||
r#" <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
|
||||
f.filename,
|
||||
key_start + x_margin,
|
||||
y_end,
|
||||
key_end - x_margin,
|
||||
y_end,
|
||||
style,
|
||||
)?;
|
||||
write!(
|
||||
svg,
|
||||
"<title>{}<br>{} - {}</title>",
|
||||
f.filename, lsn_end, y_end
|
||||
)?;
|
||||
writeln!(svg, "</line>")?;
|
||||
}
|
||||
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
|
||||
}
|
||||
files_seen.insert(f);
|
||||
}
|
||||
|
||||
let mut record_style = Style::default();
|
||||
record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
|
||||
record_style.stroke = Stroke::None;
|
||||
|
||||
writeln!(svg, "{}", EndSvg)?;
|
||||
|
||||
let mut layer_events_str = String::new();
|
||||
let mut first = true;
|
||||
for e in history {
|
||||
if !first {
|
||||
writeln!(layer_events_str, ",")?;
|
||||
}
|
||||
write!(
|
||||
layer_events_str,
|
||||
r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
|
||||
e.time_rel, e.file.filename, e.op
|
||||
)?;
|
||||
first = false;
|
||||
}
|
||||
writeln!(layer_events_str)?;
|
||||
|
||||
writeln!(
|
||||
output,
|
||||
r#"<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
/* Keep the slider pinned at top */
|
||||
.topbar {{
|
||||
display: block;
|
||||
overflow: hidden;
|
||||
background-color: lightgrey;
|
||||
position: fixed;
|
||||
top: 0;
|
||||
width: 100%;
|
||||
/* width: 500px; */
|
||||
}}
|
||||
.slidercontainer {{
|
||||
float: left;
|
||||
width: 50%;
|
||||
margin-right: 200px;
|
||||
}}
|
||||
.slider {{
|
||||
float: left;
|
||||
width: 100%;
|
||||
}}
|
||||
.legend {{
|
||||
width: 200px;
|
||||
float: right;
|
||||
}}
|
||||
|
||||
/* Main content */
|
||||
.main {{
|
||||
margin-top: 50px; /* Add a top margin to avoid content overlay */
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body onload="init()">
|
||||
<script type="text/javascript">
|
||||
|
||||
var layer_events = [{layer_events_str}]
|
||||
|
||||
let ticker;
|
||||
|
||||
function init() {{
|
||||
for (let i = 0; i < layer_events.length; i++) {{
|
||||
var layer = document.getElementById("layer_" + layer_events[i].filename);
|
||||
layer.style.visibility = "hidden";
|
||||
}}
|
||||
last_layer_event = -1;
|
||||
moveSlider(last_slider_pos)
|
||||
}}
|
||||
|
||||
function startAnimation() {{
|
||||
ticker = setInterval(animateStep, 100);
|
||||
}}
|
||||
function stopAnimation() {{
|
||||
clearInterval(ticker);
|
||||
}}
|
||||
|
||||
function animateStep() {{
|
||||
if (last_layer_event < layer_events.length - 1) {{
|
||||
var slider = document.getElementById("time-slider");
|
||||
let prevPos = slider.value
|
||||
let nextEvent = last_layer_event + 1
|
||||
while (nextEvent <= layer_events.length - 1) {{
|
||||
if (layer_events[nextEvent].time_rel > prevPos) {{
|
||||
break;
|
||||
}}
|
||||
nextEvent += 1;
|
||||
}}
|
||||
let nextPos = layer_events[nextEvent].time_rel
|
||||
slider.value = nextPos
|
||||
moveSlider(nextPos)
|
||||
}}
|
||||
}}
|
||||
|
||||
function redoLayerEvent(n, dir) {{
|
||||
var layer = document.getElementById("layer_" + layer_events[n].filename);
|
||||
switch (layer_events[n].op) {{
|
||||
case "flush":
|
||||
layer.style.visibility = "visible";
|
||||
break;
|
||||
case "create_delta":
|
||||
layer.style.visibility = "visible";
|
||||
break;
|
||||
case "create_image":
|
||||
layer.style.visibility = "visible";
|
||||
break;
|
||||
case "delete":
|
||||
layer.style.visibility = "hidden";
|
||||
break;
|
||||
}}
|
||||
}}
|
||||
function undoLayerEvent(n) {{
|
||||
var layer = document.getElementById("layer_" + layer_events[n].filename);
|
||||
switch (layer_events[n].op) {{
|
||||
case "flush":
|
||||
layer.style.visibility = "hidden";
|
||||
break;
|
||||
case "create_delta":
|
||||
layer.style.visibility = "hidden";
|
||||
break;
|
||||
case "create_image":
|
||||
layer.style.visibility = "hidden";
|
||||
break;
|
||||
case "delete":
|
||||
layer.style.visibility = "visible";
|
||||
break;
|
||||
}}
|
||||
}}
|
||||
|
||||
var last_slider_pos = 0
|
||||
var last_layer_event = 0
|
||||
|
||||
var moveSlider = function(new_pos) {{
|
||||
if (new_pos > last_slider_pos) {{
|
||||
while (last_layer_event < layer_events.length - 1) {{
|
||||
if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
|
||||
break;
|
||||
}}
|
||||
last_layer_event += 1;
|
||||
redoLayerEvent(last_layer_event)
|
||||
}}
|
||||
}}
|
||||
if (new_pos < last_slider_pos) {{
|
||||
while (last_layer_event >= 0) {{
|
||||
if (layer_events[last_layer_event].time_rel <= new_pos) {{
|
||||
break;
|
||||
}}
|
||||
undoLayerEvent(last_layer_event)
|
||||
last_layer_event -= 1;
|
||||
}}
|
||||
}}
|
||||
last_slider_pos = new_pos;
|
||||
document.getElementById("debug_pos").textContent=new_pos;
|
||||
if (last_layer_event >= 0) {{
|
||||
document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
|
||||
}} else {{
|
||||
document.getElementById("debug_layer_event").textContent="begin";
|
||||
}}
|
||||
}}
|
||||
</script>
|
||||
|
||||
<div class="topbar">
|
||||
<div class="slidercontainer">
|
||||
<label for="time-slider">TIME</label>:
|
||||
<input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
|
||||
|
||||
pos: <span id="debug_pos"></span><br>
|
||||
event: <span id="debug_layer_event"></span><br>
|
||||
gc: <span id="debug_gc_event"></span><br>
|
||||
</div>
|
||||
|
||||
<button onclick="startAnimation()">Play</button>
|
||||
<button onclick="stopAnimation()">Stop</button>
|
||||
|
||||
<svg class="legend">
|
||||
<rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
|
||||
<line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
|
||||
<line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
|
||||
</svg>
|
||||
</div>
|
||||
|
||||
<div class="main">
|
||||
{svg}
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"#
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
35
pageserver/compaction/tests/tests.rs
Normal file
35
pageserver/compaction/tests/tests.rs
Normal file
@@ -0,0 +1,35 @@
|
||||
use pageserver_compaction::interface::CompactionLayer;
|
||||
use pageserver_compaction::simulator::MockTimeline;
|
||||
|
||||
/// Test the extreme case that there are so many updates for a single key that
|
||||
/// even if we produce an extremely narrow delta layer, spanning just that one
|
||||
/// key, we still too many records to fit in the target file size. We need to
|
||||
/// split in the LSN dimension too in that case.
|
||||
///
|
||||
/// TODO: The code to avoid this problem has not been implemented yet! So the
|
||||
/// assertion currently fails, but we need to make it not fail.
|
||||
#[ignore]
|
||||
#[tokio::test]
|
||||
async fn test_many_updates_for_single_key() {
|
||||
let mut executor = MockTimeline::new();
|
||||
executor.target_file_size = 10_000_000; // 10 MB
|
||||
|
||||
// Ingest 100 MB of updates to a single key.
|
||||
for _ in 1..1000 {
|
||||
executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
|
||||
executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
|
||||
executor.compact().await.unwrap();
|
||||
}
|
||||
|
||||
// Check that all the layers are smaller than the target size (with some slop)
|
||||
for l in executor.live_layers.iter() {
|
||||
println!("layer {}: {}", l.short_id(), l.file_size());
|
||||
}
|
||||
for l in executor.live_layers.iter() {
|
||||
assert!(l.file_size() < executor.target_file_size * 2);
|
||||
// sanity check that none of the delta layers are stupidly small either
|
||||
if l.is_delta() {
|
||||
assert!(l.file_size() > executor.target_file_size / 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user